Priyanka-Kumavat-At-TE committed on
Commit
2fc2c1f
1 Parent(s): 3eb0b43

Upload 19 files

Files changed (19)
  1. supv/__init__.py +0 -0
  2. supv/bacl.py +493 -0
  3. supv/basic_nn.py +293 -0
  4. supv/fftn.py +240 -0
  5. supv/gbt.py +482 -0
  6. supv/gcn.py +444 -0
  7. supv/knn.py +106 -0
  8. supv/lrd.py +112 -0
  9. supv/lstm.py +414 -0
  10. supv/mcalib.py +384 -0
  11. supv/mcclf.py +207 -0
  12. supv/nlm.py +434 -0
  13. supv/optunar.py +127 -0
  14. supv/pasearch.py +243 -0
  15. supv/regress.py +253 -0
  16. supv/rf.py +134 -0
  17. supv/svm.py +141 -0
  18. supv/svml.py +428 -0
  19. supv/tnn.py +789 -0
supv/__init__.py ADDED
File without changes
supv/bacl.py ADDED
@@ -0,0 +1,493 @@
+ #!/usr/local/bin/python3
+
+ # avenir-python: Machine Learning
+ # Author: Pranab Ghosh
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
+ # may not use this file except in compliance with the License. You may
+ # obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+
+ # Package imports
+ import os
+ import sys
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import sklearn as sk
+ import sklearn.metrics
+ import matplotlib
+ import random
+ import jprops
+ from io import StringIO
+ from sklearn.model_selection import cross_val_score
+ import joblib
+ from random import randint
+ sys.path.append(os.path.abspath("../lib"))
+ from util import *
+ from mlutil import *
+ from pasearch import *
+
+ #base classifier class
+ class BaseClassifier(object):
+
+     def __init__(self, configFile, defValues, mname):
+         self.config = Configuration(configFile, defValues)
+         self.subSampleRate = None
+         self.featData = None
+         self.clsData = None
+         self.classifier = None
+         self.trained = False
+         self.verbose = self.config.getBooleanConfig("common.verbose")[0]
+         logFilePath = self.config.getStringConfig("common.logging.file")[0]
+         logLevName = self.config.getStringConfig("common.logging.level")[0]
+         self.logger = createLogger(mname, logFilePath, logLevName)
+         self.logger.info("********* starting session")
+
+     def initConfig(self, configFile, defValues):
+         """
+         initialize config
+         """
+         self.config = Configuration(configFile, defValues)
+
+     def getConfig(self):
+         """
+         get config object
+         """
+         return self.config
+
+     def setConfigParam(self, name, value):
+         """
+         set config param
+         """
+         self.config.setParam(name, value)
+
+     def getMode(self):
+         """
+         get mode
+         """
+         return self.config.getStringConfig("common.mode")[0]
+
+     def getSearchParamStrategy(self):
+         """
+         get search parameter strategy
+         """
+         return self.config.getStringConfig("train.search.param.strategy")[0]
+
+     def train(self):
+         """
+         train model
+         """
+         #build model
+         self.buildModel()
+
+         # training data
+         if self.featData is None:
+             (featData, clsData) = self.prepTrainingData()
+             (self.featData, self.clsData) = (featData, clsData)
+         else:
+             (featData, clsData) = (self.featData, self.clsData)
+         if self.subSampleRate is not None:
+             (featData, clsData) = subSample(featData, clsData, self.subSampleRate, False)
+             self.logger.info("subsample size " + str(featData.shape[0]))
+
+         # parameters
+         modelSave = self.config.getBooleanConfig("train.model.save")[0]
+
+         #train
+         self.logger.info("...training model")
+         self.classifier.fit(featData, clsData)
+         score = self.classifier.score(featData, clsData)
+         successCriterion = self.config.getStringConfig("train.success.criterion")[0]
+         result = None
+         if successCriterion == "accuracy":
+             self.logger.info("accuracy with training data {:06.3f}".format(score))
+             result = score
+         elif successCriterion == "error":
+             error = 1.0 - score
+             self.logger.info("error with training data {:06.3f}".format(error))
+             result = error
+         else:
+             raise ValueError("invalid success criterion")
+
+         if modelSave:
+             self.logger.info("...saving model")
+             modelFilePath = self.getModelFilePath()
+             joblib.dump(self.classifier, modelFilePath)
+         self.trained = True
+         return result
+
+     def trainValidate(self):
+         """
+         train with k fold validation
+         """
+         #build model
+         self.buildModel()
+
+         # training data
+         (featData, clsData) = self.prepTrainingData()
+
+         #parameter
+         validation = self.config.getStringConfig("train.validation")[0]
+         numFolds = self.config.getIntConfig("train.num.folds")[0]
+         successCriterion = self.config.getStringConfig("train.success.criterion")[0]
+         scoreMethod = self.config.getStringConfig("train.score.method")[0]
+
+         #train with validation
+         self.logger.info("...training and kfold cross validating model")
+         scores = cross_val_score(self.classifier, featData, clsData, cv=numFolds, scoring=scoreMethod)
+         avScore = np.mean(scores)
+         result = self.reportResult(avScore, successCriterion, scoreMethod)
+         return result
+
+     def trainValidateSearch(self):
+         """
+         train with k fold validation and search parameter space for optimum
+         """
+         self.logger.info("...starting train validate with parameter search")
+         searchStrategyName = self.getSearchParamStrategy()
+         if searchStrategyName is not None:
+             if searchStrategyName == "grid":
+                 searchStrategy = GuidedParameterSearch(self.verbose)
+             elif searchStrategyName == "random":
+                 searchStrategy = RandomParameterSearch(self.verbose)
+                 maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
+                 searchStrategy.setMaxIter(maxIter)
+             elif searchStrategyName == "simuan":
+                 searchStrategy = SimulatedAnnealingParameterSearch(self.verbose)
+                 maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
+                 searchStrategy.setMaxIter(maxIter)
+                 temp = self.config.getFloatConfig("train.search.sa.temp")[0]
+                 searchStrategy.setTemp(temp)
+                 tempRedRate = self.config.getFloatConfig("train.search.sa.temp.red.rate")[0]
+                 searchStrategy.setTempReductionRate(tempRedRate)
+             else:
+                 raise ValueError("invalid parameter search strategy")
+         else:
+             raise ValueError("missing search strategy")
+
+         # add search params
+         searchParams = self.config.getStringConfig("train.search.params")[0].split(",")
+         searchParamNames = []
+         extSearchParamNames = []
+         if searchParams is not None:
+             for searchParam in searchParams:
+                 paramItems = searchParam.split(":")
+                 extSearchParamNames.append(paramItems[0])
+
+                 #get rid of the name component "search"
+                 paramNameItems = paramItems[0].split(".")
+                 del paramNameItems[1]
+                 paramItems[0] = ".".join(paramNameItems)
+
+                 searchStrategy.addParam(paramItems)
+                 searchParamNames.append(paramItems[0])
+         else:
+             raise ValueError("missing search parameter list")
+
+         # add search param data list for each param
+         for (searchParamName, extSearchParamName) in zip(searchParamNames, extSearchParamNames):
+             searchParamData = self.config.getStringConfig(extSearchParamName)[0].split(",")
+             searchStrategy.addParamVaues(searchParamName, searchParamData)
+
+         # train and validate for various param value combinations
+         searchStrategy.prepare()
+         paramValues = searchStrategy.nextParamValues()
+         searchResults = []
+         while paramValues is not None:
+             self.logger.info("...next parameter set")
+             paramStr = ""
+             for paramValue in paramValues:
+                 self.setConfigParam(paramValue[0], str(paramValue[1]))
+                 paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
+             result = self.trainValidate()
+             searchStrategy.setCost(result)
+             searchResults.append((paramStr, result))
+             paramValues = searchStrategy.nextParamValues()
+
+         # output
+         self.logger.info("all parameter search results")
+         for searchResult in searchResults:
+             self.logger.info("{}\t{:06.3f}".format(searchResult[0], searchResult[1]))
+
+         self.logger.info("best parameter search result")
+         bestSolution = searchStrategy.getBestSolution()
+         paramStr = ""
+         for paramValue in bestSolution[0]:
+             paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
+         self.logger.info("{}\t{:06.3f}".format(paramStr, bestSolution[1]))
+         return bestSolution
+
+     def validate(self):
+         """
+         validate with labeled data
+         """
+         # create model
+         useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
+         if useSavedModel:
+             # load saved model
+             self.logger.info("...loading model")
+             modelFilePath = self.getModelFilePath()
+             self.classifier = joblib.load(modelFilePath)
+         else:
+             # train model
+             if not self.trained:
+                 self.train()
+
+         # prepare test data
+         (featData, clsDataActual) = self.prepValidationData()
+
+         #predict
+         self.logger.info("...predicting")
+         clsDataPred = self.classifier.predict(featData)
+
+         self.logger.info("...validating")
+         scoreMethod = self.config.getStringConfig("validate.score.method")[0]
+         if scoreMethod == "accuracy":
+             accuracy = sk.metrics.accuracy_score(clsDataActual, clsDataPred)
+             self.logger.info("accuracy:")
+             self.logger.info(accuracy)
+         elif scoreMethod == "confusionMatrix":
+             confMatrx = sk.metrics.confusion_matrix(clsDataActual, clsDataPred)
+             self.logger.info("confusion matrix:")
+             self.logger.info(confMatrx)
+
+     def predictx(self):
+         """
+         predict from file data
+         """
+         # create model
+         self.prepModel()
+
+         # prepare test data
+         featData = self.prepPredictData()
+
+         #predict
+         self.logger.info("...predicting")
+         clsData = self.classifier.predict(featData)
+         self.logger.info(clsData)
+
+     def predict(self, recs=None):
+         """
+         predict with in memory data
+         """
+         # create model
+         self.prepModel()
+
+         #input record
+         if recs:
+             #passed record
+             featData = self.prepStringPredictData(recs)
+             if (featData.ndim == 1):
+                 featData = featData.reshape(1, -1)
+         else:
+             #file
+             featData = self.prepPredictData()
+
+         #predict
+         self.logger.info("...predicting")
+         clsData = self.classifier.predict(featData)
+         return clsData
+
+     def predictProb(self, recs):
+         """
+         predict probability with in memory data
+         """
+         raise ValueError("cannot predict class probability")
+
+     def prepModel(self):
+         """
+         prepare model
+         """
+         useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
+         if (useSavedModel and not self.classifier):
+             # load saved model
+             self.logger.info("...loading saved model")
+             modelFilePath = self.getModelFilePath()
+             self.classifier = joblib.load(modelFilePath)
+         else:
+             # train model
+             if not self.trained:
+                 self.train()
+
+     def prepTrainingData(self):
+         """
+         loads and prepares training data
+         """
+         # parameters
+         dataFile = self.config.getStringConfig("train.data.file")[0]
+         fieldIndices = self.config.getStringConfig("train.data.fields")[0]
+         if fieldIndices is not None:
+             fieldIndices = strToIntArray(fieldIndices, ",")
+         featFieldIndices = self.config.getStringConfig("train.data.feature.fields")[0]
+         if featFieldIndices is not None:
+             featFieldIndices = strToIntArray(featFieldIndices, ",")
+         classFieldIndex = self.config.getIntConfig("train.data.class.field")[0]
+
+         #training data
+         (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
+         if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
+             scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
+             featData = scaleData(featData, scalingMethod)
+
+         clsData = extrColumns(data, classFieldIndex)
+         clsData = np.array([int(a) for a in clsData])
+         return (featData, clsData)
+
+     def prepValidationData(self):
+         """
+         loads and prepares validation data
+         """
+         # parameters
+         dataFile = self.config.getStringConfig("validate.data.file")[0]
+         fieldIndices = self.config.getStringConfig("validate.data.fields")[0]
+         if fieldIndices is not None:
+             fieldIndices = strToIntArray(fieldIndices, ",")
+         featFieldIndices = self.config.getStringConfig("validate.data.feature.fields")[0]
+         if featFieldIndices is not None:
+             featFieldIndices = strToIntArray(featFieldIndices, ",")
+         classFieldIndex = self.config.getIntConfig("validate.data.class.field")[0]
+
+         #validation data
+         (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
+         if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
+             scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
+             featData = scaleData(featData, scalingMethod)
+         clsData = extrColumns(data, classFieldIndex)
+         clsData = [int(a) for a in clsData]
+         return (featData, clsData)
+
+     def prepPredictData(self):
+         """
+         loads and prepares prediction data
+         """
+         # parameters
+         dataFile = self.config.getStringConfig("predict.data.file")[0]
+         if dataFile is None:
+             raise ValueError("missing prediction data file")
+         fieldIndices = self.config.getStringConfig("predict.data.fields")[0]
+         if fieldIndices is not None:
+             fieldIndices = strToIntArray(fieldIndices, ",")
+         featFieldIndices = self.config.getStringConfig("predict.data.feature.fields")[0]
+         if featFieldIndices is not None:
+             featFieldIndices = strToIntArray(featFieldIndices, ",")
+
+         #prediction data
+         (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
+         if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
+             scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
+             featData = scaleData(featData, scalingMethod)
+
+         return featData
+
+     def prepStringPredictData(self, recs):
+         """
+         prepare string predict data
+         """
+         frecs = StringIO(recs)
+         featData = np.loadtxt(frecs, delimiter=',')
+         return featData
+
+     def getModelFilePath(self):
+         """
+         get model file path
+         """
+         modelDirectory = self.config.getStringConfig("common.model.directory")[0]
+         modelFile = self.config.getStringConfig("common.model.file")[0]
+         if modelFile is None:
+             raise ValueError("missing model file name")
+         modelFilePath = modelDirectory + "/" + modelFile
+         return modelFilePath
+
+     def reportResult(self, score, successCriterion, scoreMethod):
+         """
+         report result
+         """
+         if successCriterion == "accuracy":
+             self.logger.info("average " + scoreMethod + " with k fold cross validation {:06.3f}".format(score))
+             result = score
+         elif successCriterion == "error":
+             error = 1.0 - score
+             self.logger.info("average error with k fold cross validation {:06.3f}".format(error))
+             result = error
+         else:
+             raise ValueError("invalid success criterion")
+         return result
+
+     def autoTrain(self):
+         """
+         auto train
+         """
+         maxTestErr = self.config.getFloatConfig("train.auto.max.test.error")[0]
+         maxErr = self.config.getFloatConfig("train.auto.max.error")[0]
+         maxErrDiff = self.config.getFloatConfig("train.auto.max.error.diff")[0]
+
+         self.config.setParam("train.model.save", "False")
+
+         #train, validate and search for optimum parameters
+         result = self.trainValidateSearch()
+         testError = result[1]
+
+         #subsample training size to match train size for k fold validation
+         numFolds = self.config.getIntConfig("train.num.folds")[0]
+         self.subSampleRate = float(numFolds - 1) / numFolds
+
+         #train only with optimum parameter values
+         for paramValue in result[0]:
+             pName = paramValue[0]
+             pValue = paramValue[1]
+             self.logger.info(pName + " " + str(pValue))
+             self.setConfigParam(pName, pValue)
+         trainError = self.train()
+
+         if testError < maxTestErr:
+             # criteria based on test error only
+             self.logger.info("Successfully trained. Low test error level")
+             status = 1
+         else:
+             # criteria based on bias error and generalization error
+             avError = (trainError + testError) / 2
+             diffError = testError - trainError
+             self.logger.info("Auto training completed: training error {:06.3f} test error: {:06.3f}".format(trainError, testError))
+             self.logger.info("Average of test and training error: {:06.3f} test and training error diff: {:06.3f}".format(avError, diffError))
+             if diffError > maxErrDiff:
+                 # high generalization error
+                 if avError > maxErr:
+                     # high bias error
+                     self.logger.info("High generalization error and high error. Need larger training data set and increased model complexity")
+                     status = 4
+                 else:
+                     # low bias error
+                     self.logger.info("High generalization error. Need larger training data set")
+                     status = 3
+             else:
+                 # low generalization error
+                 if avError > maxErr:
+                     # high bias error
+                     self.logger.info("Converged, but with high error rate. Need to increase model complexity")
+                     status = 2
+                 else:
+                     # low bias error
+                     self.logger.info("Successfully trained. Low generalization error and low bias error level")
+                     status = 1
+
+         if status == 1:
+             #train final model, use all data and save model
+             self.logger.info("...training the final model")
+             self.config.setParam("train.model.save", "True")
+             self.subSampleRate = None
+             trainError = self.train()
+             self.logger.info("training error in final model {:06.3f}".format(trainError))
+
+         return status
+
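
A minimal sketch of how a concrete classifier plugs into BaseClassifier above; the subclass, the properties file name, and the choice of SVC are illustrative assumptions, not part of this commit. train() only requires that buildModel() leave an estimator with fit() and score() in self.classifier, and that the properties file also supplies the common.verbose and common.logging.* keys read in the constructor.

from sklearn.svm import SVC

class DemoClassifier(BaseClassifier):  # hypothetical subclass for illustration
    def __init__(self, configFile):
        defValues = {}
        # every property used above needs a default or a "missing" message
        defValues["common.verbose"] = (False, None)
        defValues["train.data.file"] = (None, "missing training data file")
        defValues["train.data.fields"] = (None, "missing field ordinals")
        defValues["train.data.feature.fields"] = (None, "missing feature field ordinals")
        defValues["train.data.class.field"] = (None, "missing class field ordinal")
        defValues["train.model.save"] = (False, None)
        defValues["train.success.criterion"] = ("error", None)
        super(DemoClassifier, self).__init__(configFile, defValues, __name__)

    def buildModel(self):
        # any scikit-learn style estimator works here
        self.classifier = SVC()

clf = DemoClassifier("demo.properties")  # demo.properties is hypothetical
print(clf.train())
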
supv/basic_nn.py ADDED
@@ -0,0 +1,293 @@
+ #!/Users/pranab/Tools/anaconda/bin/python
+
+ # avenir-python: Machine Learning
+ # Author: Pranab Ghosh
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
+ # may not use this file except in compliance with the License. You may
+ # obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+
+ # Package imports
+ import os
+ import sys
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import sklearn
+ import sklearn.datasets
+ import sklearn.linear_model
+ import matplotlib
+
+
+ if len(sys.argv) != 7:
+     print("usage: <num_hidden_units> <data_set_size> <noise_in_data> <iteration_count> <learning_rate> <training_mode>")
+     sys.exit()
+
+ # number of hidden units
+ nn_hdim = int(sys.argv[1])
+
+ # data set size
+ dsize = int(sys.argv[2])
+
+ # noise in training data
+ noise_level = float(sys.argv[3])
+
+ # iteration count
+ it_count = int(sys.argv[4])
+
+ # learning rate
+ epsilon = float(sys.argv[5])
+
+ #training mode
+ training_mode = sys.argv[6]
+
+ # validation
+ use_validation_data = True
+
+ # Generate a dataset
+ #noise_level = 0.20
+ #noise_level = 0.01
+ vlo = 100
+ vup = vlo + dsize // 5
+ vsize = vup - vlo
+ print("validation data size %d" %(vsize))
+ np.random.seed(0)
+ XC, yc = sklearn.datasets.make_moons(dsize, noise=noise_level)
+
+ print("complete data set generated")
+ def print_array(X, y):
+     print(X)
+     print(y)
+
+
+ # Generate a validation dataset
+ #np.random.seed(0)
+ #XV, yv = sklearn.datasets.make_moons(40, noise=0.20)
+ #print("validation data set generated")
+
+ XV = XC[vlo:vup:1]
+ yv = yc[vlo:vup:1]
+ print("validation data generated")
+ #print_array(XV, yv)
+
+ X = np.delete(XC, np.s_[vlo:vup:1], 0)
+ y = np.delete(yc, np.s_[vlo:vup:1], 0)
+ print("training data generated")
+ #print_array(X, y)
+ print(X)
+ print(y)
+
+
+ # Parameters
+ num_examples = len(X) # training set size
+ nn_input_dim = 2 # input layer dimensionality
+ nn_output_dim = 2 # output layer dimensionality
+
+ #training data indices
+ tr_data_indices = np.arange(num_examples)
+ #print(tr_data_indices)
+
+ # Gradient descent parameters (I picked these by hand)
+ #epsilon = 0.01 # learning rate for gradient descent
+ reg_lambda = 0.01 # regularization strength
+
+
+ # Helper function to evaluate the total loss on the dataset
+ def calculate_loss(X, y, model):
+     W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
+     size = len(X)
+
+     # Forward propagation to calculate our predictions
+     z1 = X.dot(W1) + b1
+     a1 = np.tanh(z1)
+     z2 = a1.dot(W2) + b2
+     exp_scores = np.exp(z2)
+     probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+     # Calculating the loss
+     correct_logprobs = -np.log(probs[range(size), y])
+     data_loss = np.sum(correct_logprobs)
+
+     # Add regularization term to loss (optional)
+     data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
+     return 1./size * data_loss
+
+
+ # Helper function to predict an output (0 or 1)
+ def predict(model, x):
+     W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
+
+     # Forward propagation
+     z1 = x.dot(W1) + b1
+     a1 = np.tanh(z1)
+     z2 = a1.dot(W2) + b2
+     exp_scores = np.exp(z2)
+     probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+     return np.argmax(probs, axis=1)
+
+ # This function learns parameters for the neural network in batch mode and returns the model.
+ # - nn_hdim: Number of nodes in the hidden layer
+ # - num_passes: Number of passes through the training data for gradient descent
+ # - validation_interval: Evaluate and print the loss every validation_interval passes
+ def build_model_batch(nn_hdim, num_passes=10000, validation_interval=50):
+     # Initialize the parameters to random values. We need to learn these.
+     np.random.seed(0)
+     W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
+     b1 = np.zeros((1, nn_hdim))
+     W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
+     b2 = np.zeros((1, nn_output_dim))
+
+     # This is what we return at the end
+     model = {}
+
+     # Gradient descent. For each batch...
+     loss = -1.0
+     for i in range(0, num_passes):
+         #print("pass %d" %(i))
+
+         # Forward propagation
+         z1 = X.dot(W1) + b1
+         a1 = np.tanh(z1)
+         z2 = a1.dot(W2) + b2
+         exp_scores = np.exp(z2)
+         probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+         # Back propagation
+         delta3 = probs
+         delta3[range(num_examples), y] -= 1
+         dW2 = (a1.T).dot(delta3)
+         db2 = np.sum(delta3, axis=0, keepdims=True)
+         delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
+         dW1 = np.dot(X.T, delta2)
+         db1 = np.sum(delta2, axis=0)
+
+         # Add regularization terms (b1 and b2 don't have regularization terms)
+         dW2 += reg_lambda * W2
+         dW1 += reg_lambda * W1
+
+         # Gradient descent parameter update
+         W1 += -epsilon * dW1
+         b1 += -epsilon * db1
+         W2 += -epsilon * dW2
+         b2 += -epsilon * db2
+
+         # Assign new parameters to the model
+         model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
+
+         # This is expensive because it uses the whole dataset, so we don't want to do it too often.
+         if i % validation_interval == 0:
+             if use_validation_data:
+                 cur_loss = calculate_loss(XV, yv, model)
+             else:
+                 cur_loss = calculate_loss(X, y, model)
+
+             print("Loss after iteration %i: %.8f" %(i, cur_loss))
+             loss = cur_loss
+
+     return model
+
+
+ # This function learns parameters for the neural network in incremental (stochastic) mode and returns the model.
+ # - nn_hdim: Number of nodes in the hidden layer
+ # - num_passes: Number of passes through the training data for gradient descent
+ # - validation_interval: Evaluate and print the loss every validation_interval passes
+ def build_model_incr(nn_hdim, num_passes=10000, validation_interval=50):
+     # Initialize the parameters to random values. We need to learn these.
+     np.random.seed(0)
+     W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
+     b1 = np.zeros((1, nn_hdim))
+     W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
+     b2 = np.zeros((1, nn_output_dim))
+
+     # This is what we return at the end
+     model = {}
+
+     # gradient descent. For each pass...
+     loss = -1.0
+     for i in range(0, num_passes):
+         #print("pass %d" %(i))
+
+         #shuffle training data indices
+         np.random.shuffle(tr_data_indices)
+
+         # all training data, one sample at a time
+         for j in tr_data_indices:
+             Xi = X[j].reshape(1, 2)
+             yi = y[j].reshape(1)
+
+             # Forward propagation
+             z1 = Xi.dot(W1) + b1
+             a1 = np.tanh(z1)
+             z2 = a1.dot(W2) + b2
+             exp_scores = np.exp(z2)
+             probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+             # Back propagation
+             delta3 = probs
+             delta3[0, yi] -= 1
+             dW2 = (a1.T).dot(delta3)
+             db2 = np.sum(delta3, axis=0, keepdims=True)
+             delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
+             dW1 = np.dot(Xi.T, delta2)
+             db1 = np.sum(delta2, axis=0)
+
+             # Add regularization terms (b1 and b2 don't have regularization terms)
+             dW2 += reg_lambda * W2
+             dW1 += reg_lambda * W1
+
+             # Gradient descent parameter update
+             W1 += -epsilon * dW1
+             b1 += -epsilon * db1
+             W2 += -epsilon * dW2
+             b2 += -epsilon * db2
+
+         # Assign new parameters to the model
+         model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
+
+         # This is expensive because it uses the whole dataset, so we don't want to do it too often.
+         if i % validation_interval == 0:
+             if use_validation_data:
+                 cur_loss = calculate_loss(XV, yv, model)
+             else:
+                 cur_loss = calculate_loss(X, y, model)
+
+             print("Loss after iteration %i: %.8f" %(i, cur_loss))
+             loss = cur_loss
+
+     return model
+
+
+ # Build a model with the specified number of hidden units
+ if (training_mode == "batch"):
+     model = build_model_batch(nn_hdim, num_passes=it_count, validation_interval=1)
+ elif (training_mode == "incr"):
+     model = build_model_incr(nn_hdim, num_passes=it_count, validation_interval=1)
+ else:
+     print("invalid training mode")
+     sys.exit()
+
+ print("hidden layer")
+ for row in model['W1']:
+     print(row)
+
+ print("hidden layer bias")
+ for row in model['b1']:
+     print(row)
+
+ print("output layer")
+ for row in model['W2']:
+     print(row)
+
+ print("output layer bias")
+ for row in model['b2']:
+     print(row)
+
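
basic_nn.py is driven entirely by the six positional arguments parsed at the top of the file; following the usage string above, a sample invocation (argument values illustrative only) is "python basic_nn.py 4 1000 0.2 2000 0.01 batch", i.e. a 4-unit hidden layer, 1000 two-moons samples with noise 0.2, 2000 gradient descent passes at learning rate 0.01, trained in batch mode.
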
supv/fftn.py ADDED
@@ -0,0 +1,240 @@
+ #!/usr/local/bin/python3
+
+ # avenir-python: Machine Learning
+ # Author: Pranab Ghosh
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
+ # may not use this file except in compliance with the License. You may
+ # obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+
+ # Package imports
+ import os
+ import sys
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import torch
+ from torch.autograd import Variable
+ from torch.utils.data import Dataset, TensorDataset
+ from torch.utils.data import DataLoader
+ import sklearn as sk
+ import matplotlib
+ import random
+ import jprops
+ from random import randint
+ sys.path.append(os.path.abspath("../lib"))
+ from util import *
+ from mlutil import *
+ from tnn import *
+
+
+ class FeedForwardTwinNetwork(FeedForwardNetwork):
+     """
+     siamese twin feed forward network
+     """
+     def __init__(self, configFile):
+         defValues = dict()
+         defValues["train.twin.crossenc"] = (False, None)
+         super(FeedForwardTwinNetwork, self).__init__(configFile, defValues)
+
+     def buildModel(self):
+         """
+         Loads configuration and builds the various pieces necessary for the model
+         """
+         super().buildModel()
+
+         #split validation features into anchor, positive and negative parts
+         feCount = self.config.getIntConfig("train.input.size")[0]
+         self.vaFe1 = self.validFeatData[:, :feCount]
+         self.vaFe2 = self.validFeatData[:, feCount:2*feCount]
+         self.vaFe3 = self.validFeatData[:, 2*feCount:]
+
+     def forward(self, x1, x2, x3):
+         """
+         Go through the shared layers once for each of the three inputs
+         """
+         y1 = self.layers(x1)
+         y2 = self.layers(x2)
+         y3 = self.layers(x3)
+         y = (y1, y2, y3)
+         return y
+
+     @staticmethod
+     def batchTrain(model):
+         """
+         train with batch data
+         """
+         feCount = model.config.getIntConfig("train.input.size")[0]
+         fe1 = model.featData[:, :feCount]
+         fe2 = model.featData[:, feCount:2*feCount]
+         fe3 = model.featData[:, 2*feCount:]
+
+         print(fe1.shape)
+         print(fe2.shape)
+         print(fe3.shape)
+         trainData = TensorDataset(fe1, fe2, fe3)
+         trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)
+         epochIntv = model.config.getIntConfig("train.epoch.intv")[0]
+
+         # train mode
+         model.train()
+
+         if model.trackErr:
+             trErr = list()
+             vaErr = list()
+         #epoch
+         for t in range(model.numIter):
+             #batch
+             b = 0
+             epochLoss = 0.0
+             for x1Batch, x2Batch, x3Batch in trainDataLoader:
+
+                 # Forward pass: Compute predicted y by passing x to the model
+                 yPred = model(x1Batch, x2Batch, x3Batch)
+
+                 # Compute and print loss
+                 loss = model.lossFn(yPred[0], yPred[1], yPred[2])
+                 if model.verbose and t % epochIntv == 0 and model.batchIntv > 0 and b % model.batchIntv == 0:
+                     print("epoch {} batch {} loss {:.6f}".format(t, b, loss.item()))
+
+                 if model.trackErr and model.batchIntv == 0:
+                     epochLoss += loss.item()
+
+                 #error tracking at batch level
+                 if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:
+                     trErr.append(loss.item())
+                     vloss = FeedForwardTwinNetwork.evaluateModel(model)
+                     vaErr.append(vloss)
+
+                 # Zero gradients, perform a backward pass, and update the weights.
+                 model.optimizer.zero_grad()
+                 loss.backward()
+                 model.optimizer.step()
+                 b += 1
+
+             #error tracking at epoch level
+             if model.trackErr and model.batchIntv == 0:
+                 epochLoss /= b
+                 if model.verbose:
+                     print("epoch {} loss {:.6f}".format(t, epochLoss))
+                 trErr.append(epochLoss)
+                 vloss = FeedForwardTwinNetwork.evaluateModel(model)
+                 vaErr.append(vloss)
+
+         #validate
+         """
+         model.eval()
+         yPred = model(model.vaFeOne, model.vaFeTwo)
+         yPred = yPred.data.cpu().numpy()
+         yActual = model.validOutData.data.cpu().numpy()
+         if model.verbose:
+             vsize = yPred.shape[0]
+             print("\npredicted \t\t actual")
+             for i in range(vsize):
+                 print(str(yPred[i]) + "\t" + str(yActual[i]))
+
+         score = perfMetric(model.accMetric, yActual, yPred)
+         print(yActual)
+         print(yPred)
+         print(formatFloat(3, score, "perf score"))
+         """
+
+         #save
+         modelSave = model.config.getBooleanConfig("train.model.save")[0]
+         if modelSave:
+             FeedForwardNetwork.saveCheckpt(model)
+
+         if model.trackErr:
+             FeedForwardNetwork.errorPlot(model, trErr, vaErr)
+
+         return 1.0
+
+     @staticmethod
+     def evaluateModel(model):
+         """
+         evaluate model
+
+         Parameters
+         model : torch model
+         """
+         model.eval()
+         with torch.no_grad():
+             yPred = model(model.vaFe1, model.vaFe2, model.vaFe3)
+             score = model.lossFn(yPred[0], yPred[1], yPred[2]).item()
+         model.train()
+         return score
+
+     @staticmethod
+     def testModel(model):
+         """
+         test model
+
+         Parameters
+         model : torch model
+         """
+         useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
+         if useSavedModel:
+             FeedForwardNetwork.restoreCheckpt(model)
+         else:
+             FeedForwardTwinNetwork.batchTrain(model)
+
+         dataSource = model.config.getStringConfig("predict.data.file")[0]
+         featData = FeedForwardNetwork.prepData(model, dataSource, False)
+         featData = torch.from_numpy(featData)
+         feCount = model.config.getIntConfig("train.input.size")[0]
+         fe1 = featData[:, :feCount]
+         fe2 = featData[:, feCount:2*feCount]
+         fe3 = featData[:, 2*feCount:]
+
+         model.eval()
+         with torch.no_grad():
+             yp = model(fe1, fe2, fe3)
+             cos = torch.nn.CosineSimilarity()
+             s1 = cos(yp[0], yp[1]).data.cpu().numpy()
+             s2 = cos(yp[0], yp[2]).data.cpu().numpy()
+             #print(s1.shape)
+
+         n = yp[0].shape[0]
+         if model.verbose:
+             print(n)
+             for i in range(15):
+                 if i % 3 == 0:
+                     print("next")
+                 print(yp[0][i])
+                 print(yp[1][i])
+                 print(yp[2][i])
+                 print("similarity {:.3f} {:.3f}".format(s1[i], s2[i]))
+
+         tc = 0
+         cc = 0
+         outputSize = model.config.getIntConfig("train.output.size")[0]
+         for i in range(0, n, outputSize):
+             #for each sample, outputSize no of rows
+             msi = None
+             imsi = None
+             for j in range(outputSize):
+                 #first one positive, followed by all negative
+                 si = (s1[i+j] + s2[i+j]) / 2
+                 if msi is None or si > msi:
+                     msi = si
+                     imsi = j
+             tc += 1
+             if imsi == 0:
+                 cc += 1
+         score = cc / tc
+         print("score: {:.3f}".format(score))
+         model.train()
+         return score
+
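
batchTrain() and evaluateModel() above call model.lossFn with three embeddings (anchor, positive, negative), so the configured loss must be a triplet-style criterion. Whether the base FeedForwardNetwork config wires in exactly torch.nn.TripletMarginLoss is an assumption here, but it has the expected three-argument signature; a standalone sketch with illustrative shapes:

import torch

lossFn = torch.nn.TripletMarginLoss(margin=1.0)
anchor = torch.randn(8, 16, requires_grad=True)  # 8 samples, 16-dim embeddings
positive = torch.randn(8, 16)
negative = torch.randn(8, 16)
# scalar loss: pulls anchor toward positive, pushes it from negative by the margin
loss = lossFn(anchor, positive, negative)
loss.backward()  # standalone gradient check; real training goes through batchTrain()
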
supv/gbt.py ADDED
@@ -0,0 +1,482 @@
1
+ #!/usr/local/bin/python3
2
+
3
+ # avenir-python: Machine Learning
4
+ # Author: Pranab Ghosh
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You may
8
+ # obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+
18
+ # Package imports
19
+ import os
20
+ import sys
21
+ import matplotlib.pyplot as plt
22
+ import numpy as np
23
+ import sklearn as sk
24
+ import matplotlib
25
+ import random
26
+ import jprops
27
+ from sklearn.ensemble import GradientBoostingClassifier
28
+ import joblib
29
+ from sklearn.metrics import accuracy_score
30
+ from sklearn.metrics import confusion_matrix
31
+ from sklearn.model_selection import cross_val_score
32
+ from random import randint
33
+ from io import StringIO
34
+ sys.path.append(os.path.abspath("../lib"))
35
+ from util import *
36
+ from mlutil import *
37
+ from pasearch import *
38
+ from bacl import *
39
+
40
+ # gradient boosting classification
41
+ class GradientBoostedTrees(object):
42
+ def __init__(self, configFile):
43
+ defValues = {}
44
+ defValues["common.mode"] = ("training", None)
45
+ defValues["common.model.directory"] = ("model", None)
46
+ defValues["common.model.file"] = (None, None)
47
+ defValues["common.preprocessing"] = (None, None)
48
+ defValues["common.verbose"] = (False, None)
49
+ defValues["train.data.file"] = (None, "missing training data file")
50
+ defValues["train.data.fields"] = (None, "missing training data field ordinals")
51
+ defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
52
+ defValues["train.data.class.field"] = (None, "missing class field ordinal")
53
+ defValues["train.validation"] = ("kfold", None)
54
+ defValues["train.num.folds"] = (5, None)
55
+ defValues["train.min.samples.split"] = ("4", None)
56
+ defValues["train.min.samples.leaf.gb"] = ("2", None)
57
+ defValues["train.max.depth.gb"] = (3, None)
58
+ defValues["train.max.leaf.nodes.gb"] = (None, None)
59
+ defValues["train.max.features.gb"] = (None, None)
60
+ defValues["train.learning.rate"] = (0.1, None)
61
+ defValues["train.num.estimators.gb"] = (100, None)
62
+ defValues["train.subsample"] = (1.0, None)
63
+ defValues["train.loss"] = ("deviance", None)
64
+ defValues["train.random.state"] = (None, None)
65
+ defValues["train.verbose"] = (0, None)
66
+ defValues["train.warm.start"] = (False, None)
67
+ defValues["train.presort"] = ("auto", None)
68
+ defValues["train.criterion"] = ("friedman_mse", None)
69
+ defValues["train.success.criterion"] = ("error", None)
70
+ defValues["train.model.save"] = (False, None)
71
+ defValues["train.score.method"] = ("accuracy", None)
72
+ defValues["train.search.param.strategy"] = (None, None)
73
+ defValues["train.search.params"] = (None, None)
74
+ defValues["predict.data.file"] = (None, None)
75
+ defValues["predict.data.fields"] = (None, "missing data field ordinals")
76
+ defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
77
+ defValues["predict.use.saved.model"] = (False, None)
78
+ defValues["validate.data.file"] = (None, "missing validation data file")
79
+ defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
80
+ defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
81
+ defValues["validate.data.class.field"] = (None, "missing class field ordinal")
82
+ defValues["validate.use.saved.model"] = (False, None)
83
+ defValues["validate.score.method"] = ("accuracy", None)
84
+
85
+ self.config = Configuration(configFile, defValues)
86
+ self.subSampleRate = None
87
+ self.featData = None
88
+ self.clsData = None
89
+ self.gbcClassifier = None
90
+ self.verbose = self.config.getBooleanConfig("common.verbose")[0]
91
+ logFilePath = self.config.getStringConfig("common.logging.file")[0]
92
+ logLevName = self.config.getStringConfig("common.logging.level")[0]
93
+ self.logger = createLogger(__name__, logFilePath, logLevName)
94
+ self.logger.info("********* starting session")
95
+
96
+ # initialize config
97
+ def initConfig(self, configFile, defValues):
98
+ self.config = Configuration(configFile, defValues)
99
+
100
+ # get config object
101
+ def getConfig(self):
102
+ return self.config
103
+
104
+ #set config param
105
+ def setConfigParam(self, name, value):
106
+ self.config.setParam(name, value)
107
+
108
+ #get mode
109
+ def getMode(self):
110
+ return self.config.getStringConfig("common.mode")[0]
111
+
112
+ #get search parameter
113
+ def getSearchParamStrategy(self):
114
+ return self.config.getStringConfig("train.search.param.strategy")[0]
115
+
116
+ def setModel(self, model):
117
+ self.gbcClassifier = model
118
+
119
+ # train model
120
+ def train(self):
121
+ #build model
122
+ self.buildModel()
123
+
124
+ # training data
125
+ if self.featData is None:
126
+ (featData, clsData) = self.prepTrainingData()
127
+ (self.featData, self.clsData) = (featData, clsData)
128
+ else:
129
+ (featData, clsData) = (self.featData, self.clsData)
130
+ if self.subSampleRate is not None:
131
+ (featData, clsData) = subSample(featData, clsData, self.subSampleRate, False)
132
+ self.logger.info("subsample size " + str(featData.shape[0]))
133
+
134
+ # parameters
135
+ modelSave = self.config.getBooleanConfig("train.model.save")[0]
136
+
137
+ #train
138
+ self.logger.info("...training model")
139
+ self.gbcClassifier.fit(featData, clsData)
140
+ score = self.gbcClassifier.score(featData, clsData)
141
+ successCriterion = self.config.getStringConfig("train.success.criterion")[0]
142
+ result = None
143
+ if successCriterion == "accuracy":
144
+ self.logger.info("accuracy with training data {:06.3f}".format(score))
145
+ result = score
146
+ elif successCriterion == "error":
147
+ error = 1.0 - score
148
+ self.logger.info("error with training data {:06.3f}".format(error))
149
+ result = error
150
+ else:
151
+ raise ValueError("invalid success criterion")
152
+
153
+ if modelSave:
154
+ self.logger.info("...saving model")
155
+ modelFilePath = self.getModelFilePath()
156
+ joblib.dump(self.gbcClassifier, modelFilePath)
157
+ return result
158
+
159
+ #train with k fold validation
160
+ def trainValidate(self):
161
+ #build model
162
+ self.buildModel()
163
+
164
+ # training data
165
+ (featData, clsData) = self.prepTrainingData()
166
+
167
+ #parameter
168
+ validation = self.config.getStringConfig("train.validation")[0]
169
+ numFolds = self.config.getIntConfig("train.num.folds")[0]
170
+ successCriterion = self.config.getStringConfig("train.success.criterion")[0]
171
+ scoreMethod = self.config.getStringConfig("train.score.method")[0]
172
+
173
+ #train with validation
174
+ self.logger.info("...training and kfold cross validating model")
175
+ scores = cross_val_score(self.gbcClassifier, featData, clsData, cv=numFolds,scoring=scoreMethod)
176
+ avScore = np.mean(scores)
177
+ result = self.reportResult(avScore, successCriterion, scoreMethod)
178
+ return result
179
+
180
+ #train with k fold validation and search parameter space for optimum
181
+ def trainValidateSearch(self):
182
+ self.logger.info("...starting train validate with parameter search")
183
+ searchStrategyName = self.getSearchParamStrategy()
184
+ if searchStrategyName is not None:
185
+ if searchStrategyName == "grid":
186
+ searchStrategy = GuidedParameterSearch(self.verbose)
187
+ elif searchStrategyName == "random":
188
+ searchStrategy = RandomParameterSearch(self.verbose)
189
+ maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
190
+ searchStrategy.setMaxIter(maxIter)
191
+ elif searchStrategyName == "simuan":
192
+ searchStrategy = SimulatedAnnealingParameterSearch(self.verbose)
193
+ maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
194
+ searchStrategy.setMaxIter(maxIter)
195
+ temp = self.config.getFloatConfig("train.search.sa.temp")[0]
196
+ searchStrategy.setTemp(temp)
197
+ tempRedRate = self.config.getFloatConfig("train.search.sa.temp.red.rate")[0]
198
+ searchStrategy.setTempReductionRate(tempRedRate)
199
+ else:
200
+ raise ValueError("invalid paramtere search strategy")
201
+ else:
202
+ raise ValueError("missing search strategy")
203
+
204
+ # add search params
205
+ searchParams = self.config.getStringConfig("train.search.params")[0].split(",")
206
+ searchParamNames = []
207
+ extSearchParamNames = []
208
+ if searchParams is not None:
209
+ for searchParam in searchParams:
210
+ paramItems = searchParam.split(":")
211
+ extSearchParamNames.append(paramItems[0])
212
+
213
+ #get rid name component search
214
+ paramNameItems = paramItems[0].split(".")
215
+ del paramNameItems[1]
216
+ paramItems[0] = ".".join(paramNameItems)
217
+
218
+ searchStrategy.addParam(paramItems)
219
+ searchParamNames.append(paramItems[0])
220
+ else:
221
+ raise ValueError("missing search parameter list")
222
+
223
+ # add search param data list for each param
224
+ for (searchParamName,extSearchParamName) in zip(searchParamNames,extSearchParamNames):
225
+ searchParamData = self.config.getStringConfig(extSearchParamName)[0].split(",")
226
+ searchStrategy.addParamVaues(searchParamName, searchParamData)
227
+
228
+ # train and validate for various param value combination
229
+ searchStrategy.prepare()
230
+ paramValues = searchStrategy.nextParamValues()
231
+ searchResults = []
232
+ while paramValues is not None:
233
+ self.logger.info("...next parameter set")
234
+ paramStr = ""
235
+ for paramValue in paramValues:
236
+ self.setConfigParam(paramValue[0], str(paramValue[1]))
237
+ paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
238
+ result = self.trainValidate()
239
+ searchStrategy.setCost(result)
240
+ searchResults.append((paramStr, result))
241
+ paramValues = searchStrategy.nextParamValues()
242
+
243
+ # output
244
+ self.logger.info("all parameter search results")
245
+ for searchResult in searchResults:
246
+ self.logger.info("{}\t{:06.3f}".format(searchResult[0], searchResult[1]))
247
+
248
+ self.logger.info("best parameter search result")
249
+ bestSolution = searchStrategy.getBestSolution()
250
+ paramStr = ""
251
+ for paramValue in bestSolution[0]:
252
+ paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
253
+ self.logger.info("{}\t{:06.3f}".format(paramStr, bestSolution[1]))
254
+ return bestSolution
255
+
256
+ #predict
257
+ def validate(self):
258
+ # create model
259
+ useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
260
+ if useSavedModel:
261
+ # load saved model
262
+ self.logger.info("...loading model")
263
+ modelFilePath = self.getModelFilePath()
264
+ self.gbcClassifier = joblib.load(modelFilePath)
265
+ else:
266
+ # train model
267
+ self.train()
268
+
269
+ # prepare test data
270
+ (featData, clsDataActual) = self.prepValidationData()
271
+
272
+ #predict
273
+ self.logger.info("...predicting")
274
+ clsDataPred = self.gbcClassifier.predict(featData)
275
+
276
+ self.logger.info("...validating")
277
+ #self.logger.info(clsData)
278
+ scoreMethod = self.config.getStringConfig("validate.score.method")[0]
279
+ if scoreMethod == "accuracy":
280
+ accuracy = accuracy_score(clsDataActual, clsDataPred)
281
+ self.logger.info("accuracy:")
282
+ self.logger.info(accuracy)
283
+ elif scoreMethod == "confusionMatrix":
284
+ confMatrx = confusion_matrix(clsDataActual, clsDataPred)
285
+ self.logger.info("confusion matrix:")
286
+ self.logger.info(confMatrx)
287
+
288
+
289
+ #predict
290
+ def predictx(self):
291
+ # create model
292
+ useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
293
+ if useSavedModel:
294
+ # load saved model
295
+ self.logger.info("...loading model")
296
+ modelFilePath = self.getModelFilePath()
297
+ self.gbcClassifier = joblib.load(modelFilePath)
298
+ else:
299
+ # train model
300
+ self.train()
301
+
302
+ # prepare test data
303
+ featData = self.prepPredictData()
304
+
305
+ #predict
306
+ self.logger.info("...predicting")
307
+ clsData = self.gbcClassifier.predict(featData)
308
+ self.logger.info(clsData)
309
+
310
+ #predict with in memory data
311
+ def predict(self, recs=None):
312
+ # create model
313
+ self.prepModel()
314
+
315
+ #input record
316
+ #input record
317
+ if recs:
318
+ #passed record
319
+ featData = self.prepStringPredictData(recs)
320
+ if (featData.ndim == 1):
321
+ featData = featData.reshape(1, -1)
322
+ else:
323
+ #file
324
+ featData = self.prepPredictData()
325
+
326
+ #predict
327
+ self.logger.info("...predicting")
328
+ clsData = self.gbcClassifier.predict(featData)
329
+ return clsData
330
+
331
+ #predict probability with in memory data
332
+ def predictProb(self, recs):
333
+ # create model
334
+ self.prepModel()
335
+
336
+ #input record
337
+ if type(recs) is str:
338
+ featData = self.prepStringPredictData(recs)
339
+ else:
340
+ featData = recs
341
+ #self.logger.info(featData.shape)
342
+ if (featData.ndim == 1):
343
+ featData = featData.reshape(1, -1)
344
+
345
+ #predict
346
+ self.logger.info("...predicting class probability")
347
+ clsData = self.gbcClassifier.predict_proba(featData)
348
+ return clsData
349
+
350
+ #preparing model
351
+ def prepModel(self):
352
+ useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
353
+ if (useSavedModel and not self.gbcClassifier):
354
+ # load saved model
355
+ self.logger.info("...loading saved model")
356
+ modelFilePath = self.getModelFilePath()
357
+ self.gbcClassifier = joblib.load(modelFilePath)
358
+ else:
359
+ # train model
360
+ self.train()
361
+ return self.gbcClassifier
362
+
363
+ #prepare string predict data
364
+ def prepStringPredictData(self, recs):
365
+ frecs = StringIO(recs)
366
+ featData = np.loadtxt(frecs, delimiter=',')
367
+ #self.logger.info(featData)
368
+ return featData
369
+
370
+ #loads and prepares training data
371
+ def prepTrainingData(self):
372
+ # parameters
373
+ dataFile = self.config.getStringConfig("train.data.file")[0]
374
+ fieldIndices = self.config.getStringConfig("train.data.fields")[0]
375
+ if not fieldIndices is None:
376
+ fieldIndices = strToIntArray(fieldIndices, ",")
377
+ featFieldIndices = self.config.getStringConfig("train.data.feature.fields")[0]
378
+ if not featFieldIndices is None:
379
+ featFieldIndices = strToIntArray(featFieldIndices, ",")
380
+ classFieldIndex = self.config.getIntConfig("train.data.class.field")[0]
381
+
382
+ #training data
383
+ (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
384
+ clsData = extrColumns(data, classFieldIndex)
385
+ clsData = np.array([int(a) for a in clsData])
386
+ return (featData, clsData)
387
+
388
+ #loads and prepares training data
389
+ def prepValidationData(self):
390
+ # parameters
391
+ dataFile = self.config.getStringConfig("validate.data.file")[0]
		fieldIndices = self.config.getStringConfig("validate.data.fields")[0]
		if fieldIndices is not None:
			fieldIndices = strToIntArray(fieldIndices, ",")
		featFieldIndices = self.config.getStringConfig("validate.data.feature.fields")[0]
		if featFieldIndices is not None:
			featFieldIndices = strToIntArray(featFieldIndices, ",")
		classFieldIndex = self.config.getIntConfig("validate.data.class.field")[0]

		#validation data
		(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
		clsData = extrColumns(data, classFieldIndex)
		clsData = [int(a) for a in clsData]
		return (featData, clsData)

	#loads and prepares prediction data
	def prepPredictData(self):
		# parameters
		dataFile = self.config.getStringConfig("predict.data.file")[0]
		if dataFile is None:
			raise ValueError("missing prediction data file")
		fieldIndices = self.config.getStringConfig("predict.data.fields")[0]
		if fieldIndices is not None:
			fieldIndices = strToIntArray(fieldIndices, ",")
		featFieldIndices = self.config.getStringConfig("predict.data.feature.fields")[0]
		if featFieldIndices is not None:
			featFieldIndices = strToIntArray(featFieldIndices, ",")

		#prediction data
		(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)

		return featData

	# get model file path
	def getModelFilePath(self):
		modelDirectory = self.config.getStringConfig("common.model.directory")[0]
		modelFile = self.config.getStringConfig("common.model.file")[0]
		if modelFile is None:
			raise ValueError("missing model file name")
		modelFilePath = modelDirectory + "/" + modelFile
		return modelFilePath

	# report result
	def reportResult(self, score, successCriterion, scoreMethod):
		if successCriterion == "accuracy":
			self.logger.info("average " + scoreMethod + " with k fold cross validation {:06.3f}".format(score))
			result = score
		elif successCriterion == "error":
			error = 1.0 - score
			self.logger.info("average error with k fold cross validation {:06.3f}".format(error))
			result = error
		else:
			raise ValueError("invalid success criterion")
		return result

	# builds model object
	def buildModel(self):
		self.logger.info("...building gradient boosted tree model")
		# parameters
		minSamplesSplit = self.config.getStringConfig("train.min.samples.split")[0]
		minSamplesSplit = typedValue(minSamplesSplit)
		minSamplesLeaf = self.config.getStringConfig("train.min.samples.leaf.gb")[0]
		minSamplesLeaf = typedValue(minSamplesLeaf)
		#minWeightFractionLeaf = self.config.getFloatConfig("train.min.weight.fraction.leaf.gb")[0]
		(maxDepth, maxLeafNodes) = self.config.eitherOrIntConfig("train.max.depth.gb", "train.max.leaf.nodes.gb")
		maxFeatures = self.config.getStringConfig("train.max.features.gb")[0]
		maxFeatures = typedValue(maxFeatures)
		learningRate = self.config.getFloatConfig("train.learning.rate")[0]
		numEstimators = self.config.getIntConfig("train.num.estimators.gb")[0]
		subsampleFraction = self.config.getFloatConfig("train.subsample")[0]
		lossFun = self.config.getStringConfig("train.loss")[0]
		randomState = self.config.getIntConfig("train.random.state")[0]
		verboseOutput = self.config.getIntConfig("train.verbose")[0]
		warmStart = self.config.getBooleanConfig("train.warm.start")[0]
		presort = self.config.getStringConfig("train.presort")
		if presort[1]:
			presortChoice = presort[0]
		else:
			presortChoice = presort[0].lower() == "true"
		splitCriterion = self.config.getStringConfig("train.criterion")[0]

		#classifier
		self.gbcClassifier = GradientBoostingClassifier(loss=lossFun, learning_rate=learningRate, n_estimators=numEstimators,
			subsample=subsampleFraction, min_samples_split=minSamplesSplit,
			min_samples_leaf=minSamplesLeaf, min_weight_fraction_leaf=0.0, max_depth=maxDepth,
			init=None, random_state=randomState, max_features=maxFeatures, verbose=verboseOutput,
			max_leaf_nodes=maxLeafNodes, warm_start=warmStart, presort=presortChoice)
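The methods above are entirely configuration driven: every GradientBoostingClassifier hyperparameter comes from a properties file. A minimal driver sketch follows; the class name GradientBoostedTrees and the config path are illustrative assumptions, not taken from this diff.

# illustrative driver, assuming gbt.py wraps the buildModel() above in a
# BaseClassifier subclass (called GradientBoostedTrees here, hypothetical name)
#
# gbt.properties fragment (made-up values):
#   train.data.file=leads_train.csv
#   train.learning.rate=0.1
#   train.num.estimators.gb=200
#   train.success.criterion=error
gbt = GradientBoostedTrees("gbt.properties")
gbt.buildModel()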
supv/gcn.py ADDED
@@ -0,0 +1,444 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import matplotlib
import random
from random import randint
from itertools import compress
import numpy as np
import torch
from torch import nn
from torch.nn import Linear
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torch_geometric.nn import GCNConv
from torch_geometric.nn import MessagePassing
from torch_geometric.data import Data
import sklearn as sk
import jprops
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import FeedForwardNetwork

"""
Graph convolution network
"""

class GraphConvoNetwork(nn.Module):
	def __init__(self, configFile):
		"""
		initializer

		Parameters
			configFile : config file path
		"""
		defValues = dict()
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.scaling.minrows"] = (50, None)
		defValues["common.scaling.param.file"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.device"] = ("cpu", None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.num.nodes.total"] = (None, None)
		defValues["train.data.num.nodes.training"] = (None, None)
		defValues["train.data.splits"] = ([.75,.15,.10], None)
		defValues["train.layer.data"] = (None, "missing layer data")
		defValues["train.input.size"] = (None, "missing input size")
		defValues["train.output.size"] = (None, "missing output size")
		defValues["train.loss.reduction"] = ("mean", None)
		defValues["train.num.iterations"] = (500, None)
		defValues["train.lossFn"] = ("mse", None)
		defValues["train.optimizer"] = ("sgd", None)
		defValues["train.opt.learning.rate"] = (.0001, None)
		defValues["train.opt.weight.decay"] = (0, None)
		defValues["train.opt.momentum"] = (0, None)
		defValues["train.opt.eps"] = (1e-08, None)
		defValues["train.opt.dampening"] = (0, None)
		defValues["train.opt.momentum.nesterov"] = (False, None)
		defValues["train.opt.betas"] = ([0.9, 0.999], None)
		defValues["train.opt.alpha"] = (0.99, None)
		defValues["train.save.model"] = (False, None)
		defValues["train.track.error"] = (False, None)
		defValues["train.epoch.intv"] = (5, None)
		defValues["train.print.weights"] = (False, None)
		defValues["valid.accuracy.metric"] = (None, None)
		defValues["predict.create.mask"] = (False, None)
		defValues["predict.use.saved.model"] = (True, None)

		self.config = Configuration(configFile, defValues)
		super(GraphConvoNetwork, self).__init__()


	def getConfig(self):
		"""
		returns config
		"""
		return self.config

	def buildModel(self):
		"""
		loads configuration and builds the various pieces necessary for the model
		"""
		torch.manual_seed(9999)

		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		numinp = self.config.getIntConfig("train.input.size")[0]
		self.outputSize = self.config.getIntConfig("train.output.size")[0]
		self.numIter = self.config.getIntConfig("train.num.iterations")[0]
		optimizer = self.config.getStringConfig("train.optimizer")[0]
		self.lossFnStr = self.config.getStringConfig("train.lossFn")[0]
		self.accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]
		self.trackErr = self.config.getBooleanConfig("train.track.error")[0]
		self.restored = False
		self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None

		#build network
		layers = list()
		ninp = numinp
		trData = self.config.getStringConfig("train.layer.data")[0].split(",")
		for ld in trData:
			lde = ld.split(":")
			ne = len(lde)
			assert ne == 5 or ne == 6, "expecting 5 or 6 items for layer data"

			gconv = False
			if ne == 6:
				if lde[0] == "gconv":
					gconv = True
				lde = lde[1:]

			#num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction
			nunit = int(lde[0])
			actStr = lde[1]
			act = FeedForwardNetwork.createActivation(actStr) if actStr != "none" else None
			bnorm = lde[2] == "true"
			afterAct = lde[3] == "true"
			dpr = float(lde[4])

			if gconv:
				layers.append(GCNConv(ninp, nunit))
			else:
				layers.append(Linear(ninp, nunit))
			if bnorm:
				#with batch norm
				if afterAct:
					safeAppend(layers, act)
					layers.append(torch.nn.BatchNorm1d(nunit))
				else:
					layers.append(torch.nn.BatchNorm1d(nunit))
					safeAppend(layers, act)
			else:
				#without batch norm
				safeAppend(layers, act)

			if dpr > 0:
				layers.append(torch.nn.Dropout(dpr))
			ninp = nunit

		self.layers = torch.nn.ModuleList(layers)
		self.device = FeedForwardNetwork.getDevice(self)
		self.to(self.device)
		self.loadData()

		self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)
		self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)
		self.trained = False

	def loadData(self):
		"""
		load node and edge data
		"""
		dataFilePath = self.config.getStringConfig("train.data.file")[0]
		numNodes = self.config.getIntConfig("train.data.num.nodes.total")[0]
		numLabeled = self.config.getIntConfig("train.data.num.nodes.training")[0]
		splits = self.config.getFloatListConfig("train.data.splits")[0]
		crPredMask = self.config.getBooleanConfig("predict.create.mask")[0]

		dx = list()
		dy = list()
		edges = list()
		mask = None
		for rec in fileRecGen(dataFilePath, ","):
			if len(rec) > 2:
				#node record with features and class label
				x = rec[1 :-1]
				x = toFloatList(x)
				y = int(rec[-1])
				dx.append(x)
				dy.append(y)
			elif len(rec) == 2:
				#edge record
				e = toIntList(rec)
				edges.append(e)
			elif len(rec) == 1:
				#mask record with index ranges of labeled nodes
				items = rec[0].split()
				assertEqual(items[0], "mask", "invalid mask data")
				numNodes = int(items[1])
				print(numNodes)
				mask = list()
				for r in range(2, len(items), 1):
					ri = items[r].split(":")
					#print(ri)
					ms = list(range(int(ri[0]), int(ri[1]), 1))
					mask.extend(ms)

		#scale node features
		if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
			scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
			dx = scaleData(dx, scalingMethod)

		dx = torch.tensor(dx, dtype=torch.float)
		dy = torch.tensor(dy, dtype=torch.long)
		edges = torch.tensor(edges, dtype=torch.long)
		edges = edges.t().contiguous()
		dx = dx.to(self.device)
		dy = dy.to(self.device)
		edges = edges.to(self.device)
		self.data = Data(x=dx, edge_index=edges, y=dy)

		#mask
		if mask is None:
			#training data in the beginning
			trStart = 0
			vaStart = int(splits[0] * numLabeled)
			teStart = vaStart + int(splits[1] * numLabeled)

			trMask = [False] * numNodes
			trMask[0:vaStart] = [True] * vaStart
			vaMask = [False] * numNodes
			vaMask[vaStart:teStart] = [True] * (teStart - vaStart)
			teMask = [False] * numNodes
			teMask[teStart:] = [True] * (numNodes - teStart)
		else:
			#training data anywhere
			if crPredMask:
				prMask = [True] * numNodes
				for i in mask:
					prMask[i] = False
				self.prMask = torch.tensor(prMask, dtype=torch.bool)

			nshuffle = int(len(mask) / 2)
			shuffle(mask, nshuffle)
			#print(mask)
			lmask = len(mask)
			trme = int(splits[0] * lmask)
			vame = int((splits[0] + splits[1]) * lmask)
			teme = lmask
			trMask = [False] * numNodes
			for i in mask[:trme]:
				trMask[i] = True
			vaMask = [False] * numNodes
			for i in mask[trme:vame]:
				vaMask[i] = True
			teMask = [False] * numNodes
			for i in mask[vame:]:
				teMask[i] = True
			#print(vaMask)

		trMask = torch.tensor(trMask, dtype=torch.bool)
		trMask = trMask.to(self.device)
		self.data.train_mask = trMask
		vaMask = torch.tensor(vaMask, dtype=torch.bool)
		vaMask = vaMask.to(self.device)
		self.data.val_mask = vaMask
		teMask = torch.tensor(teMask, dtype=torch.bool)
		teMask = teMask.to(self.device)
		self.data.test_mask = teMask
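From the parsing logic in loadData above, a single CSV file mixes three record types: node records (id, features, class label), edge records (two node ids), and an optional space separated mask record listing index ranges of labeled nodes. An illustrative fragment (values made up):

# node records: id, features..., class label
# 0,0.52,1.37,0.08,1
# 1,0.11,0.92,0.66,0
# edge records: from-node, to-node
# 0,1
# 1,0
# mask record: "mask <numNodes> <start:end> ..."
# mask 2708 0:140 1708:2708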
	def descData(self):
		"""
		describe data
		"""
		print(f'Number of nodes: {self.data.num_nodes}')
		print(f'Number of edges: {self.data.num_edges}')
		print(f'Number of node features: {self.data.num_node_features}')
		print(f'Number of training nodes: {self.data.train_mask.sum()}')
		print(f'Training node label rate: {int(self.data.train_mask.sum()) / self.data.num_nodes:.2f}')
		print(f'Number of validation nodes: {self.data.val_mask.sum()}')
		print(f'Number of test nodes: {self.data.test_mask.sum()}')
		print(f'Is undirected: {self.data.is_undirected()}')

		print("Data attributes")
		print(self.data.keys)

		print("Data types")
		print(type(self.data.x))
		print(type(self.data.y))
		print(type(self.data.edge_index))
		print(type(self.data.train_mask))

		print("Sample data")
		print("x", self.data.x[:4])
		print("y", self.data.y[:4])
		print("edge", self.data.edge_index[:4])
		print("train mask", self.data.train_mask[:4])
		print("test mask", self.data.test_mask[:4])

		print("Any isolated node? " , self.data.has_isolated_nodes())
		print("Any self loop? ", self.data.has_self_loops())
		print("Is graph directed? ", self.data.is_directed())

	def forward(self):
		"""
		forward prop
		"""
		x, edges = self.data.x, self.data.edge_index
		for l in self.layers:
			if isinstance(l, MessagePassing):
				#graph convolution layer needs the edge list
				x = l(x, edges)
			else:
				x = l(x)
		return x

	@staticmethod
	def trainModel(model):
		"""
		train with batch data

		Parameters
			model : torch model
		"""
		epochIntv = model.config.getIntConfig("train.epoch.intv")[0]

		model.train()
		if model.trackErr:
			trErr = list()
			vaErr = list()

		for epoch in range(model.numIter):
			out = model()
			loss = model.lossFn(out[model.data.train_mask], model.data.y[model.data.train_mask])

			#error tracking at batch level
			if model.trackErr:
				trErr.append(loss.item())
				vErr = GraphConvoNetwork.evaluateModel(model)
				vaErr.append(vErr)
				if model.verbose and epoch % epochIntv == 0:
					print("epoch {} loss {:.6f} val error {:.6f}".format(epoch, loss.item(), vErr))

			model.optimizer.zero_grad()
			loss.backward()
			model.optimizer.step()

		#acc = GraphConvoNetwork.evaluateModel(model, True)
		#print(acc)
		modelSave = model.config.getBooleanConfig("train.save.model")[0]
		if modelSave:
			FeedForwardNetwork.saveCheckpt(model)

		if model.trackErr:
			FeedForwardNetwork.errorPlot(model, trErr, vaErr)

		model.trained = True

	@staticmethod
	def evaluateModel(model, verbose=False):
		"""
		evaluate model

		Parameters
			model : torch model
			verbose : if True additional output
		"""
		model.eval()
		with torch.no_grad():
			out = model()
			if verbose:
				print(out)
			yPred = out[model.data.val_mask].data.cpu().numpy()
			yActual = model.data.y[model.data.val_mask].data.cpu().numpy()
			if verbose:
				for pa in zip(yPred, yActual):
					print(pa)
			#correct = yPred == yActual
			#score = int(correct.sum()) / int(model.data.val_mask.sum())

			score = perfMetric(model.lossFnStr, yActual, yPred, model.clabels)

		model.train()
		return score

	@staticmethod
	def validateModel(model, retPred=False):
		"""
		model validation

		Parameters
			model : torch model
			retPred : if True return prediction
		"""
		model.eval()
		with torch.no_grad():
			out = model()
			yPred = out.argmax(dim=1)
			yPred = yPred[model.data.test_mask].data.cpu().numpy()
			yActual = model.data.y[model.data.test_mask].data.cpu().numpy()
			#correct = yPred == yActual
			#score = int(correct.sum()) / int(model.data.val_mask.sum())
			score = perfMetric(model.accMetric, yActual, yPred)
			print(formatFloat(3, score, "test perf score"))
		return score

	@staticmethod
	def modelPrediction(model, inclData=True):
		"""
		make prediction

		Parameters
			model : torch model
			inclData : True to include input data
		"""
		cmask = model.config.getBooleanConfig("predict.create.mask")[0]
		if not cmask:
			print("create prediction mask property needs to be set to True")
			return None

		useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			FeedForwardNetwork.restoreCheckpt(model)
		else:
			if not model.trained:
				GraphConvoNetwork.trainModel(model)

		model.eval()
		with torch.no_grad():
			out = model()
			yPred = out.argmax(dim=1)
			yPred = yPred[model.prMask].data.cpu().numpy()

		if inclData:
			dataFilePath = model.config.getStringConfig("train.data.file")[0]
			filt = lambda r : len(r) > 2
			ndata = list(fileFiltRecGen(dataFilePath, filt))
			prMask = model.prMask.data.cpu().numpy()
			assertEqual(len(ndata), prMask.shape[0], "data and mask lengths are not equal")
			precs = list(compress(ndata, prMask))
			precs = list(map(lambda r : r[:-1], precs))
			assertEqual(len(precs), yPred.shape[0], "data and prediction lengths are not equal")
			res = zip(precs, yPred)
		else:
			res = yPred
		return res
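A rough end to end usage sketch for this class; the config path and the layer specification string are illustrative assumptions following the format parsed in buildModel (optional gconv prefix plus units:activation:bnorm:afterAct:dropout).

# illustrative driver (made-up config path and property values)
# gcn.properties could contain, e.g.:
#   train.layer.data=gconv:16:relu:false:false:0.5,gconv:7:none:false:false:0
model = GraphConvoNetwork("gcn.properties")
model.buildModel()
model.descData()
GraphConvoNetwork.trainModel(model)
score = GraphConvoNetwork.validateModel(model)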
supv/knn.py ADDED
@@ -0,0 +1,106 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from sklearn.neighbors import KNeighborsClassifier
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from bacl import *


# nearest neighbor classification
class NearestNeighbor(BaseClassifier):
	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.num.neighbors"] = (5, None)
		defValues["train.neighbor.weight"] = ("uniform", None)
		defValues["train.neighbor.search.algo"] = ("auto", None)
		defValues["train.neighbor.search.leaf.size"] = (10, None)
		defValues["train.neighbor.dist.metric"] = ("minkowski", None)
		defValues["train.neighbor.dist.metric.pow"] = (2.0, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)

		super(NearestNeighbor, self).__init__(configFile, defValues, __name__)

	def buildModel(self):
		"""
		builds model object
		"""
		self.logger.info("...building knn classifier model")
		numNeighbors = self.config.getIntConfig("train.num.neighbors")[0]
		neighborWeight = self.config.getStringConfig("train.neighbor.weight")[0]
		searchAlgo = self.config.getStringConfig("train.neighbor.search.algo")[0]
		leafSize = self.config.getIntConfig("train.neighbor.search.leaf.size")[0]
		distMetric = self.config.getStringConfig("train.neighbor.dist.metric")[0]
		metricPow = self.config.getFloatConfig("train.neighbor.dist.metric.pow")[0]

		model = KNeighborsClassifier(n_neighbors=numNeighbors, weights=neighborWeight, algorithm=searchAlgo,
		leaf_size=leafSize, p=metricPow, metric=distMetric)
		self.classifier = model
		return self.classifier

	def predictProb(self, recs=None):
		"""
		predict probability
		"""
		# create model
		self.prepModel()

		#input record
		if recs is None:
			featData = self.prepPredictData()
		else:
			if type(recs) is str:
				featData = self.prepStringPredictData(recs)
			else:
				featData = recs
			if (featData.ndim == 1):
				featData = featData.reshape(1, -1)

		#predict
		self.logger.info("...predicting class probability")
		clsData = self.classifier.predict_proba(featData)
		return clsData
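A short usage sketch; the config path and record are made up, and the training entry point is assumed to come from the BaseClassifier base class in bacl.py rather than from this file.

# illustrative use (made-up config path and record)
knn = NearestNeighbor("knn.properties")
knn.train()	#assumed BaseClassifier training entry point
probs = knn.predictProb("5.1,3.5,1.4,0.2")
print(probs)	#per class probabilities from predict_proba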
supv/lrd.py ADDED
@@ -0,0 +1,112 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.linear_model
import matplotlib
import random
import jprops
from sklearn.linear_model import LogisticRegression
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *

# logistic regression classification
class LogisticRegressionDiscriminant(BaseClassifier):

	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("train", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.scale.file.path"] = (None, "missing scale file path")
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.validation"] = ("kfold", None)
		defValues["train.num.folds"] = (5, None)
		defValues["train.penalty"] = ("l2", None)
		defValues["train.dual"] = (False, None)
		defValues["train.tolerance"] = (0.0001, None)
		defValues["train.regularization"] = (1.0, None)
		defValues["train.fit.intercept"] = (True, None)
		defValues["train.intercept.scaling"] = (1.0, None)
		defValues["train.class.weight"] = (None, None)
		defValues["train.random.state"] = (None, None)
		defValues["train.solver"] = ("liblinear", None)
		defValues["train.max.iter"] = (100, None)
		defValues["train.multi.class"] = ("ovr", None)
		defValues["train.verbose"] = (0, None)
		defValues["train.warm.start"] = (False, None)
		defValues["train.num.jobs"] = (None, None)
		defValues["train.l1.ratio"] = (None, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["train.search.param.strategy"] = (None, None)
		defValues["train.search.params"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
		defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
		defValues["validate.data.class.field"] = (None, "missing class field ordinal")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["validate.score.method"] = ("accuracy", None)

		super(LogisticRegressionDiscriminant, self).__init__(configFile, defValues, __name__)

	# builds model object
	def buildModel(self):
		self.logger.info("...building logistic regression model")
		penalty = self.config.getStringConfig("train.penalty")[0]
		dual = self.config.getBooleanConfig("train.dual")[0]
		tol = self.config.getFloatConfig("train.tolerance")[0]
		c = self.config.getFloatConfig("train.regularization")[0]
		fitIntercept = self.config.getBooleanConfig("train.fit.intercept")[0]
		interceptScaling = self.config.getFloatConfig("train.intercept.scaling")[0]
		classWeight = self.config.getStringConfig("train.class.weight")[0]
		randomState = self.config.getIntConfig("train.random.state")[0]
		solver = self.config.getStringConfig("train.solver")[0]
		maxIter = self.config.getIntConfig("train.max.iter")[0]
		multiClass = self.config.getStringConfig("train.multi.class")[0]
		verbos = self.config.getIntConfig("train.verbose")[0]
		warmStart = self.config.getBooleanConfig("train.warm.start")[0]
		nJobs = self.config.getIntConfig("train.num.jobs")[0]
		l1Ratio = self.config.getFloatConfig("train.l1.ratio")[0]

		self.classifier = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=c, fit_intercept=fitIntercept,\
		intercept_scaling=interceptScaling, class_weight=classWeight, random_state=randomState, solver=solver,\
		max_iter=maxIter, multi_class=multiClass, verbose=verbos, warm_start=warmStart, n_jobs=nJobs, l1_ratio=l1Ratio)

		return self.classifier
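One constraint worth noting when setting these properties: in scikit-learn the penalty must match the solver, e.g. the default liblinear solver handles l1 and l2 but not elasticnet, and train.l1.ratio only takes effect with an elasticnet penalty and the saga solver. An illustrative properties fragment (values made up):

# lrd.properties fragment (illustrative)
# train.penalty=elasticnet
# train.solver=saga
# train.l1.ratio=0.5
# train.max.iter=500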
supv/lstm.py ADDED
@@ -0,0 +1,414 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
import sklearn as sk
import matplotlib
import random
import jprops
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import FeedForwardNetwork

"""
LSTM with one or more hidden layers with multi dimensional data
"""

class LstmNetwork(nn.Module):
	def __init__(self, configFile):
		"""
		In the constructor we load the configuration; the LSTM, linear and output
		activation modules are instantiated later in buildModel()

		Parameters
			configFile : config file path
		"""
		defValues = dict()
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.scaling.minrows"] = (50, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.device"] = ("cpu", None)
		defValues["train.data.file"] = (None, "missing training data file path")
		defValues["train.data.type"] = ("numeric", None)
		defValues["train.data.feat.cols"] = (None, "missing feature columns")
		defValues["train.data.target.col"] = (None, "missing target column")
		defValues["train.data.delim"] = (",", None)
		defValues["train.input.size"] = (None, "missing input size")
		defValues["train.hidden.size"] = (None, "missing hidden size")
		defValues["train.output.size"] = (None, "missing output size")
		defValues["train.num.layers"] = (1, None)
		defValues["train.seq.len"] = (1, None)
		defValues["train.batch.size"] = (32, None)
		defValues["train.batch.first"] = (False, None)
		defValues["train.drop.prob"] = (0, None)
		defValues["train.optimizer"] = ("adam", None)
		defValues["train.opt.learning.rate"] = (.0001, None)
		defValues["train.opt.weight.decay"] = (0, None)
		defValues["train.opt.momentum"] = (0, None)
		defValues["train.opt.eps"] = (1e-08, None)
		defValues["train.opt.dampening"] = (0, None)
		defValues["train.opt.momentum.nesterov"] = (False, None)
		defValues["train.opt.betas"] = ([0.9, 0.999], None)
		defValues["train.opt.alpha"] = (0.99, None)
		defValues["train.out.sequence"] = (True, None)
		defValues["train.out.activation"] = ("sigmoid", None)
		defValues["train.loss.fn"] = ("mse", None)
		defValues["train.loss.reduction"] = ("mean", None)
		defValues["train.grad.clip"] = (5, None)
		defValues["train.num.iterations"] = (500, None)
		defValues["train.save.model"] = (False, None)
		defValues["valid.data.file"] = (None, "missing validation data file path")
		defValues["valid.accuracy.metric"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.use.saved.model"] = (True, None)
		defValues["predict.output"] = ("binary", None)
		defValues["predict.feat.pad.size"] = (60, None)

		self.config = Configuration(configFile, defValues)

		super(LstmNetwork, self).__init__()

	def getConfig(self):
		return self.config

	def buildModel(self):
		"""
		loads configuration and builds the various pieces necessary for the model
		"""
		torch.manual_seed(9999)
		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		self.inputSize = self.config.getIntConfig("train.input.size")[0]
		self.outputSize = self.config.getIntConfig("train.output.size")[0]
		self.nLayers = self.config.getIntConfig("train.num.layers")[0]
		self.hiddenSize = self.config.getIntConfig("train.hidden.size")[0]
		self.seqLen = self.config.getIntConfig("train.seq.len")[0]
		self.batchSize = self.config.getIntConfig("train.batch.size")[0]
		self.batchFirst = self.config.getBooleanConfig("train.batch.first")[0]
		dropProb = self.config.getFloatConfig("train.drop.prob")[0]
		self.outSeq = self.config.getBooleanConfig("train.out.sequence")[0]
		self.device = FeedForwardNetwork.getDevice(self)

		#model
		self.lstm = nn.LSTM(self.inputSize, self.hiddenSize, self.nLayers, dropout=dropProb, batch_first=self.batchFirst)
		self.linear = nn.Linear(self.hiddenSize, self.outputSize)
		outAct = self.config.getStringConfig("train.out.activation")[0]
		self.outAct = FeedForwardNetwork.createActivation(outAct)

		#load training data
		dataFilePath = self.config.getStringConfig("train.data.file")[0]
		self.fCols = self.config.getIntListConfig("train.data.feat.cols")[0]
		assert len(self.fCols) == 2, "specify only start and end columns of features"
		self.tCol = self.config.getIntConfig("train.data.target.col")[0]
		self.delim = self.config.getStringConfig("train.data.delim")[0]

		self.fData, self.tData = self.loadData(dataFilePath, self.delim, self.fCols[0], self.fCols[1], self.tCol)
		self.fData = torch.from_numpy(self.fData)
		self.fData = self.fData.to(self.device)
		self.tData = torch.from_numpy(self.tData)
		self.tData = self.tData.to(self.device)

		#load validation data
		vaDataFilePath = self.config.getStringConfig("valid.data.file")[0]
		self.vfData, self.vtData = self.loadData(vaDataFilePath, self.delim, self.fCols[0], self.fCols[1], self.tCol)
		self.vfData = torch.from_numpy(self.vfData)
		self.vfData = self.vfData.to(self.device)
		self.vtData = torch.from_numpy(self.vtData)
		self.vtData = self.vtData.to(self.device)

		self.batchSize = self.config.getIntConfig("train.batch.size")[0]
		self.dataSize = self.fData.shape[0]
		self.numBatch = int(self.dataSize / self.batchSize)
		self.restored = False

		self.to(self.device)

	def loadData(self, filePath, delim, scolStart, scolEnd, targetCol):
		"""
		loads data from a file with one sequence per line, where each sequence element can be a vector

		Parameters
			filePath : file path
			delim : field delimiter
			scolStart : seq column start index
			scolEnd : seq column end index
			targetCol : target field col index
		"""
		if targetCol >= 0:
			#include target column
			cols = list(range(scolStart, scolEnd + 1, 1))
			cols.append(targetCol)
			data = np.loadtxt(filePath, delimiter=delim, usecols=cols)
			#one output for whole sequence
			sData = data[:, :-1]
			if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
				sData = self.scaleSeqData(sData)
			tData = data[:, -1]

			#target int (index into class labels) for classification
			sData = sData.astype(np.float32)
			tData = tData.astype(np.float32) if self.outputSize == 1 else tData.astype(np.int64)
			exData = (sData, tData)
		else:
			#exclude target column
			cols = list(range(scolStart, scolEnd + 1, 1))
			data = np.loadtxt(filePath, delimiter=delim, usecols=cols)

			#one output for whole sequence
			sData = data
			if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
				sData = self.scaleSeqData(sData)

			sData = sData.astype(np.float32)
			exData = sData

		return exData

	def scaleSeqData(self, sData):
		"""
		scales data after transforming to non sequence tabular format

		Parameters
			sData : sequence data
		"""
		scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
		sData = fromMultDimSeqToTabular(sData, self.inputSize, self.seqLen)
		sData = scaleData(sData, scalingMethod)
		sData = fromTabularToMultDimSeq(sData, self.inputSize, self.seqLen)
		return sData

	def formattedBatchGenerator(self):
		"""
		transforms training data from (dataSize, seqLength x inputSize) to (batch, seqLength, inputSize) tensor
		or (seqLength, batch, inputSize) tensor
		"""

		for _ in range(self.numBatch):
			bfData = torch.zeros([self.batchSize, self.seqLen, self.inputSize], dtype=torch.float32) if self.batchFirst\
			else torch.zeros([self.seqLen, self.batchSize, self.inputSize], dtype=torch.float32)
			tdType = torch.float32 if self.outputSize == 1 else torch.long
			btData = torch.zeros([self.batchSize], dtype=tdType)

			i = 0
			for bdi in range(self.batchSize):
				di = sampleUniform(0, self.dataSize-1)
				row = self.fData[di]
				for ci, cv in enumerate(row):
					si = int(ci / self.inputSize)
					ii = ci % self.inputSize
					if self.batchFirst:
						bfData[bdi][si][ii] = cv
					else:
						#print(si, bdi, ii)
						bfData[si][bdi][ii] = cv
				btData[i] = self.tData[di]
				i += 1

			#for seq output correct first 2 dimensions
			if self.outSeq and not self.batchFirst:
				btData = torch.transpose(btData,0,1)

			yield (bfData, btData)

	def formatData(self, fData, tData=None):
		"""
		transforms validation or prediction data from (dataSize, seqLength x inputSize) to
		(batch, seqLength, inputSize) tensor or (seqLength, batch, inputSize) tensor

		Parameters
			fData : feature data
			tData : target data
		"""
		dSize = fData.shape[0]
		bfData = torch.zeros([dSize, self.seqLen, self.inputSize], dtype=torch.float32) if self.batchFirst\
		else torch.zeros([self.seqLen, dSize, self.inputSize], dtype=torch.float32)

		for ri in range(dSize):
			row = fData[ri]
			for ci, cv in enumerate(row):
				si = int(ci / self.inputSize)
				ii = ci % self.inputSize
				if self.batchFirst:
					bfData[ri][si][ii] = cv
				else:
					bfData[si][ri][ii] = cv
		if tData is not None:
			btData = torch.transpose(tData,0,1) if self.outSeq and not self.batchFirst else tData
			formData = (bfData, btData)
		else:
			formData = bfData
		return formData
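The two methods above assume each input row flattens the sequence in time major order: element ci maps to sequence step si = ci // inputSize and feature ii = ci % inputSize. A tiny standalone layout check (values made up):

# illustrative: inputSize=2, seqLen=3, one flattened row
row = [11, 12, 21, 22, 31, 32]
seq = [[row[s * 2 + i] for i in range(2)] for s in range(3)]
# seq == [[11, 12], [21, 22], [31, 32]], i.e. (seqLen, inputSize);
# training files carry one extra trailing target column per row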
	def forward(self, x, h):
		"""
		Forward pass

		Parameters
			x : input data
			h : hidden state
		"""
		out, hout = self.lstm(x,h)
		if self.outSeq:
			# seq to seq prediction
			out = out.view(-1, self.hiddenSize)
			out = self.linear(out)
			if self.outAct is not None:
				out = self.outAct(out)
			out = out.view(self.batchSize * self.seqLen, -1)
		else:
			#seq to one prediction
			out = out[self.seqLen - 1].view(-1, self.hiddenSize)
			out = self.linear(out)
			if self.outAct is not None:
				out = self.outAct(out)
			#out = out.view(self.batchSize, -1)

		return out, hout

	def initHidden(self, batch):
		"""
		Initialize hidden weights

		Parameters
			batch : batch size
		"""
		hidden = (torch.zeros(self.nLayers,batch,self.hiddenSize),
		torch.zeros(self.nLayers,batch,self.hiddenSize))
		return hidden

	def trainLstm(self):
		"""
		train lstm
		"""
		print("..starting training")
		self.train()

		#device = self.config.getStringConfig("common.device")[0]
		#self.to(device)
		optimizerName = self.config.getStringConfig("train.optimizer")[0]
		self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizerName)
		lossFn = self.config.getStringConfig("train.loss.fn")[0]
		criterion = FeedForwardNetwork.createLossFunction(self, lossFn)
		clip = self.config.getFloatConfig("train.grad.clip")[0]
		numIter = self.config.getIntConfig("train.num.iterations")[0]
		accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]


		for it in range(numIter):
			b = 0
			for inputs, labels in self.formattedBatchGenerator():
				#forward pass
				hid = self.initHidden(self.batchSize)
				hid = (hid[0].to(self.device), hid[1].to(self.device))
				inputs, labels = inputs.to(self.device), labels.to(self.device)
				output, hid = self(inputs, hid)

				#loss
				if self.outSeq:
					labels = labels.view(self.batchSize * self.seqLen, -1)
				loss = criterion(output, labels)

				if self.verbose and it % 50 == 0 and b % 10 == 0:
					print("epoch {} batch {} loss {:.6f}".format(it, b, loss.item()))

				# zero gradients, perform a backward pass, and update the weights.
				self.optimizer.zero_grad()
				loss.backward()
				nn.utils.clip_grad_norm_(self.parameters(), clip)
				self.optimizer.step()
				b += 1

		#validate
		print("..validating model")
		self.eval()
		with torch.no_grad():
			fData, tData = self.formatData(self.vfData, self.vtData)
			fData = fData.to(self.device)
			vsize = tData.shape[0]
			hid = self.initHidden(vsize)
			hid = (hid[0].to(self.device), hid[1].to(self.device))
			yPred, _ = self(fData, hid)
			yPred = yPred.data.cpu().numpy()
			yActual = tData.data.cpu().numpy()

			if self.verbose:
				print("\npredicted \t\t actual")
				for i in range(vsize):
					print(str(yPred[i]) + "\t" + str(yActual[i]))

			score = perfMetric(accMetric, yActual, yPred)
			print(formatFloat(3, score, "perf score"))

		#save
		modelSave = self.config.getBooleanConfig("train.save.model")[0]
		if modelSave:
			FeedForwardNetwork.saveCheckpt(self)

	def predictLstm(self):
		"""
		predict
		"""
		print("..predicting using model")
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			FeedForwardNetwork.restoreCheckpt(self)
		else:
			self.trainLstm()

		prDataFilePath = self.config.getStringConfig("predict.data.file")[0]
		pfData = self.loadData(prDataFilePath, self.delim, self.fCols[0], self.fCols[1], -1)
		pfData = torch.from_numpy(pfData)
		dsize = pfData.shape[0]

		#predict
		#device = self.config.getStringConfig("common.device")[0]
		self.eval()
		with torch.no_grad():
			fData = self.formatData(pfData)
			fData = fData.to(self.device)
			hid = self.initHidden(dsize)
			hid = (hid[0].to(self.device), hid[1].to(self.device))
			yPred, _ = self(fData, hid)
			yPred = yPred.data.cpu().numpy()

		if self.outputSize == 2:
			#classification
			yPred = FeedForwardNetwork.processClassifOutput(yPred, self.config)

		# print prediction
		FeedForwardNetwork.printPrediction(yPred, self.config, prDataFilePath)
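An end to end usage sketch; the config path is illustrative, and mode dispatch is assumed to live in the caller since the class itself only exposes buildModel, trainLstm and predictLstm.

# illustrative driver (made-up config path)
lstm = LstmNetwork("lstm.properties")
lstm.buildModel()
lstm.trainLstm()	#trains, validates and optionally checkpoints
lstm.predictLstm()	#restores saved model when predict.use.saved.model is True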
supv/mcalib.py ADDED
@@ -0,0 +1,384 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.neighbors import KDTree
import matplotlib
import random
import jprops
from random import randint
import statistics
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import *
from stats import *

"""
neural model calibration
"""
class ModelCalibration(object):
	def __init__(self):
		pass

	@staticmethod
	def findModelCalibration(model):
		"""
		model calibration
		"""
		FeedForwardNetwork.prepValidate(model)
		FeedForwardNetwork.validateModel(model)

		yPred = model.yPred.flatten()
		yActual = model.validOutData.flatten()
		nsamp = len(yActual)

		#print(yPred.shape)
		#print(yActual.shape)

		nBins = model.config.getIntConfig("calibrate.num.bins")[0]
		prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]

		minConf = yPred.min()
		maxConf = yPred.max()
		bsize = (maxConf - minConf) / nBins
		#print("minConf {:.3f} maxConf {:.3f} bsize {:.3f}".format(minConf, maxConf, bsize))
		blist = list(map(lambda i : None, range(nBins)))

		#binning
		for yp, ya in zip(yPred, yActual):
			indx = int((yp - minConf) / bsize)
			if indx == nBins:
				indx = nBins - 1
			#print("yp {:.3f} indx {}".format(yp, indx))
			pair = (yp, ya)
			plist = blist[indx]
			if plist is None:
				plist = list()
				blist[indx] = plist
			plist.append(pair)

		x = list()
		y = list()
		yideal = list()
		ece = 0
		mce = 0

		# per bin confidence and accuracy
		b = 0
		for plist in blist:
			if plist is not None:
				#confidence
				ypl = list(map(lambda p : p[0], plist))
				ypm = statistics.mean(ypl)
				x.append(ypm)

				#accuracy
				ypcount = 0
				for p in plist:
					yp = 1 if p[0] > prThreshhold else 0
					if (yp == 1 and p[1] == 1):
						ypcount += 1

				acc = ypcount / len(plist)
				y.append(acc)
				yideal.append(ypm)

				ce = abs(ypm - acc)
				ece += len(plist) * ce
				if ce > mce:
					mce = ce
			else:
				#empty bin
				ypm = minConf + (b + 0.5) * bsize
				x.append(ypm)
				yideal.append(ypm)
				y.append(0)
			b += 1

		#calibration plot
		drawPairPlot(x, y, yideal, "confidence", "accuracy", "actual", "ideal")

		print("confidence\taccuracy")
		for z in zip(x,y):
			print("{:.3f}\t{:.3f}".format(z[0], z[1]))

		#expected calibration error
		ece /= nsamp
		print("expected calibration error\t{:.3f}".format(ece))
		print("maximum calibration error\t{:.3f}".format(mce))
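In the loop above each bin contributes its sample count times the confidence accuracy gap, which is the usual expected calibration error ECE = sum over bins of (n_b / N) * |conf_b - acc_b|, while MCE takes the maximum gap. A tiny standalone check with made-up bin statistics:

# made-up bins: (count, mean confidence, accuracy)
bins = [(50, 0.62, 0.55), (30, 0.81, 0.80), (20, 0.95, 0.85)]
n = sum(c for c, _, _ in bins)
ece = sum(c * abs(conf - acc) for c, conf, acc in bins) / n
mce = max(abs(conf - acc) for _, conf, acc in bins)
# ece == (50*0.07 + 30*0.01 + 20*0.10) / 100 == 0.058, mce == 0.10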
	@staticmethod
	def findModelCalibrationLocal(model):
		"""
		model calibration based on k nearest neighbors
		"""
		FeedForwardNetwork.prepValidate(model)
		FeedForwardNetwork.validateModel(model)

		yPred = model.yPred.flatten()
		yActual = model.validOutData.flatten()
		nsamp = len(yActual)

		neighborCnt = model.config.getIntConfig("calibrate.num.nearest.neighbors")[0]
		prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]
		fData = model.validFeatData.numpy()
		tree = KDTree(fData, leaf_size=4)

		dist, ind = tree.query(fData, k=neighborCnt)
		calibs = list()
		#all data
		for si, ni in enumerate(ind):
			conf = 0
			ypcount = 0
			#all neighbors
			for i in ni:
				conf += yPred[i]
				yp = 1 if yPred[i] > prThreshhold else 0
				if (yp == 1 and yActual[i] == 1):
					ypcount += 1
			conf /= neighborCnt
			acc = ypcount / neighborCnt
			calib = (si, conf, acc)
			calibs.append(calib)

		#descending sort by difference between confidence and accuracy
		calibs = sorted(calibs, key=lambda c : abs(c[1] - c[2]), reverse=True)
		print("local calibration")
		print("conf\taccu\trecord")
		for i in range(19):
			si, conf, acc = calibs[i]
			rec = toStrFromList(fData[si], 3)
			print("{:.3f}\t{:.3f}\t{}".format(conf, acc, rec))

	@staticmethod
	def findModelSharpness(model):
		"""
		model sharpness
		"""
		FeedForwardNetwork.prepValidate(model)
		FeedForwardNetwork.validateModel(model)

		yPred = model.yPred.flatten()
		yActual = model.validOutData.flatten()
		nsamp = len(yActual)

		#print(yPred.shape)
		#print(yActual.shape)

		nBins = model.config.getIntConfig("calibrate.num.bins")[0]
		prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]

		minConf = yPred.min()
		maxConf = yPred.max()
		bsize = (maxConf - minConf) / nBins
		#print("minConf {:.3f} maxConf {:.3f} bsize {:.3f}".format(minConf, maxConf, bsize))
		blist = list(map(lambda i : None, range(nBins)))

		#binning
		for yp, ya in zip(yPred, yActual):
			indx = int((yp - minConf) / bsize)
			if indx == nBins:
				indx = nBins - 1
			#print("yp {:.3f} indx {}".format(yp, indx))
			pair = (yp, ya)
			plist = blist[indx]
			if plist is None:
				plist = list()
				blist[indx] = plist
			plist.append(pair)

		y = list()
		ypgcount = 0
		# per bin confidence and accuracy
		for plist in blist:
			if plist is None:
				#empty bin
				y.append(0)
				continue
			#ypl = list(map(lambda p : p[0], plist))
			#ypm = statistics.mean(ypl)
			#x.append(ypm)

			ypcount = 0
			for p in plist:
				yp = 1 if p[0] > prThreshhold else 0
				if (yp == 1 and p[1] == 1):
					ypcount += 1
					ypgcount += 1

			acc = ypcount / len(plist)
			y.append(acc)

		print("{} {}".format(ypgcount, nsamp))
		accg = ypgcount / nsamp
		accgl = [accg] * nBins
		x = list(range(nBins))
		drawPairPlot(x, y, accgl, "discretized confidence", "accuracy", "local", "global")

		contrast = list(map(lambda acc : abs(acc - accg), y))
		contrast = statistics.mean(contrast)
		print("contrast {:.3f}".format(contrast))

"""
neural model robustness
"""
class ModelRobustness(object):
	def __init__(self):
		pass

	def localPerformance(self, model, fpath, nsamp, neighborCnt):
		"""
		local performance sampling
		"""

		#load data
		fData, oData = FeedForwardNetwork.prepData(model, fpath)
		#print(type(fData))
		#print(type(oData))
		#print(fData.shape)
		dsize = fData.shape[0]
		ncol = fData.shape[1]

		#kd tree for neighborhood query
		tree = KDTree(fData, leaf_size=4)

		scores = list()
		indices = list()
		for _ in range(nsamp):
			indx = randomInt(0, dsize - 1)
			indices.append(indx)
			frow = fData[indx]
			frow = np.reshape(frow, (1, ncol))
			dist, ind = tree.query(frow, k=neighborCnt)

			ind = ind[0]
			vfData = fData[ind]
			voData = oData[ind]

			#print(type(vfData))
			#print(vfData.shape)
			#print(type(voData))
			#print(voData.shape)

			model.setValidationData((vfData, voData), False)
			score = FeedForwardNetwork.validateModel(model)
			scores.append(score)

		#performance distribution
		m, s = basicStat(scores)
		print("model performance: mean {:.3f}\tstd dev {:.3f}".format(m,s))
		drawHist(scores, "model accuracy", "accuracy", "frequency")

		#worst performance
		lscores = sorted(zip(indices, scores), key=lambda s : s[1])
		print(lscores[:5])

		lines = getFileLines(fpath, None)
		print("worst performing feature regions")
		for i,s in lscores[:5]:
			print("score {:.3f}\t{}".format(s, lines[i]))


"""
conformal prediction for regression
"""
class ConformalRegressionPrediction(object):
	def __init__(self):
		self.calibration = dict()

	def calibrate(self, ypair, confBound):
		"""
		calibration for conformal prediction
		"""
		cscores = list()
		ymax = None
		ymin = None
		for yp, ya in ypair:
			cscore = abs(yp - ya)
			cscores.append(cscore)
			if ymax is None:
				ymax = ya
				ymin = ya
			else:
				ymax = ya if ya > ymax else ymax
				ymin = ya if ya < ymin else ymin

		cscores.sort()
		drawHist(cscores, "conformal score distribution", "conformal score", "frequency", 20)
		cbi = int(confBound * len(cscores))
		scoreConfBound = cscores[cbi]
		self.calibration["scoreConfBound"] = scoreConfBound
		self.calibration["ymin"] = ymin
		self.calibration["ymax"] = ymax
		print(self.calibration)

	def saveCalib(self, fPath):
		"""
		saves conformal score calibration
		"""
		saveObject(self.calibration, fPath)

	def restoreCalib(self, fPath):
		"""
		restores conformal score calibration
		"""
		self.calibration = restoreObject(fPath)
		print(self.calibration)

	def getPredRange(self, yp, nstep=100):
		"""
		get prediction range and related data
		"""
		ymin = self.calibration["ymin"]
		ymax = self.calibration["ymax"]
		step = (ymax - ymin) / nstep
		scoreConfBound = self.calibration["scoreConfBound"]

		rmin = None
		rmax = None
		rcount = 0
		#print(ymin, ymax, step)
		for ya in np.arange(ymin, ymax, step):
			cscore = abs(yp - ya)
			if cscore < scoreConfBound:
				if rmin is None:
					#lower bound
					rmin = ya
					rmax = ya
				else:
					#keep updating upper bound
					rmax = ya if ya > rmax else rmax
				rcount += 1
			else:
				if rmax is not None and rcount > 0:
					#past upper bound
					break

		res = dict()
		res["predRangeMin"] = rmin
		res["predRangeMax"] = rmax
		accepted = yp >= rmin and yp <= rmax
		res["status"] = "accepted" if accepted else "rejected"
		conf = 1.0 - (rmax - rmin) / (ymax - ymin)
		res["confidence"] = conf

		return res
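A self contained usage sketch for the conformal predictor above; the (predicted, actual) calibration pairs and the query value are made up, and calibrate() also pops a histogram via drawHist.

# illustrative use with synthetic calibration pairs
cp = ConformalRegressionPrediction()
ypair = [(10.2, 10.0), (11.8, 12.1), (9.5, 9.9), (14.2, 13.6), (8.1, 8.4)]
cp.calibrate(ypair, 0.9)	#90th percentile conformal score bound
res = cp.getPredRange(11.0)
print(res["predRangeMin"], res["predRangeMax"], res["status"], res["confidence"])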
supv/mcclf.py ADDED
@@ -0,0 +1,207 @@
#!/usr/local/bin/python3

# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import jprops
from random import randint
from matumizi.util import *
from matumizi.mlutil import *

"""
Markov chain classifier
"""
class MarkovChainClassifier():
	def __init__(self, configFile):
		"""
		constructor

		Parameters
			configFile: config file path
		"""
		defValues = {}
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.states"] = (None, "missing state list")
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.class.labels"] = (["F", "T"], None)
		defValues["train.data.key.len"] = (1, None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.use.saved.model"] = (True, None)
		defValues["predict.log.odds.threshold"] = (0, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["valid.accuracy.metric"] = ("acc", None)
		self.config = Configuration(configFile, defValues)

		self.stTranPr = dict()
		self.clabels = self.config.getStringListConfig("train.data.class.labels")[0]
		self.states = self.config.getStringListConfig("common.states")[0]
		self.nstates = len(self.states)
		for cl in self.clabels:
			#initialize counts to 1 for Laplace smoothing of transitions
			stp = np.ones((self.nstates,self.nstates))
			self.stTranPr[cl] = stp

	def train(self):
		"""
		trains model
		"""
		#state transition matrix
		tdfPath = self.config.getStringConfig("train.data.file")[0]
		klen = self.config.getIntConfig("train.data.key.len")[0]
		for rec in fileRecGen(tdfPath):
			cl = rec[klen]
			rlen = len(rec)
			for i in range(klen+1, rlen-1, 1):
				fst = self.states.index(rec[i])
				tst = self.states.index(rec[i+1])
				self.stTranPr[cl][fst][tst] += 1

		#normalize to probability
		for cl in self.clabels:
			stp = self.stTranPr[cl]
			for i in range(self.nstates):
				s = stp[i].sum()
				r = stp[i] / s
				stp[i] = r

		#save
		if self.config.getBooleanConfig("train.model.save")[0]:
			mdPath = self.config.getStringConfig("common.model.directory")[0]
			assert os.path.exists(mdPath), "model save directory does not exist"
			mfPath = self.config.getStringConfig("common.model.file")[0]
			mfPath = os.path.join(mdPath, mfPath)

			with open(mfPath, "w") as fh:
				for cl in self.clabels:
					fh.write("label:" + cl + "\n")
					stp = self.stTranPr[cl]
					for r in stp:
						rs = ",".join(toStrList(r, 6)) + "\n"
						fh.write(rs)

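	#To make the log odds decision rule used below concrete, a small hedged
	#sketch (the two 2x2 transition matrices and the state sequence are made
	#up; the accumulation mirrors __getPrediction). Uncommented, it runs
	#standalone:
	#
	#	import math
	#	import numpy as np
	#	stTranPr = {"F": np.array([[0.8, 0.2], [0.6, 0.4]]),
	#		"T": np.array([[0.3, 0.7], [0.2, 0.8]])}
	#	states = ["L", "H"]
	#	seq = ["L", "H", "H"]
	#	lodds = 0.0
	#	for s1, s2 in zip(seq[:-1], seq[1:]):
	#		fst, tst = states.index(s1), states.index(s2)
	#		lodds += math.log(stTranPr["T"][fst][tst] / stTranPr["F"][fst][tst])
	#	#log(0.7/0.2) + log(0.8/0.4) > 0, so the sequence is classified "T"
	#	print("T" if lodds > 0 else "F")
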
	def validate(self):
		"""
		validates using model
		"""
		useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
		if useSavedModel:
			self.__restoreModel()
		else:
			self.train()

		vdfPath = self.config.getStringConfig("validate.data.file")[0]
		accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]

		yac, ypr = self.__getPrediction(vdfPath, True)
		if type(self.clabels[0]) == str:
			yac = self.__toIntClabel(yac)
			ypr = self.__toIntClabel(ypr)
		score = perfMetric(accMetric, yac, ypr)
		print(formatFloat(3, score, "perf score"))


	def predict(self):
		"""
		predicts using model
		"""
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			self.__restoreModel()
		else:
			self.train()

		#predict
		pdfPath = self.config.getStringConfig("predict.data.file")[0]
		_ , ypr = self.__getPrediction(pdfPath)
		return ypr

	def __restoreModel(self):
		"""
		restores model
		"""
		mdPath = self.config.getStringConfig("common.model.directory")[0]
		assert os.path.exists(mdPath), "model save directory does not exist"
		mfPath = self.config.getStringConfig("common.model.file")[0]
		mfPath = os.path.join(mdPath, mfPath)
		stp = None
		cl = None
		for rec in fileRecGen(mfPath):
			if len(rec) == 1:
				if stp is not None:
					stp = np.array(stp)
					self.stTranPr[cl] = stp
				cl = rec[0].split(":")[1]
				stp = list()
			else:
				frec = asFloatList(rec)
				stp.append(frec)

		stp = np.array(stp)
		self.stTranPr[cl] = stp

	def __getPrediction(self, fpath, validate=False):
		"""
		gets predictions

		Parameters
			fpath : data file path
			validate: True if validation
		"""

		nc = self.clabels[0]
		pc = self.clabels[1]
		thold = self.config.getFloatConfig("predict.log.odds.threshold")[0]
		klen = self.config.getIntConfig("train.data.key.len")[0]
		offset = klen+1 if validate else klen
		ypr = list()
		yac = list()
		for rec in fileRecGen(fpath):
			lodds = 0
			rlen = len(rec)
			for i in range(offset, rlen-1, 1):
				fst = self.states.index(rec[i])
				tst = self.states.index(rec[i+1])
				odds = self.stTranPr[pc][fst][tst] / self.stTranPr[nc][fst][tst]
				lodds += math.log(odds)
			prc = pc if lodds > thold else nc
			ypr.append(prc)
			if validate:
				yac.append(rec[klen])
			else:
				recp = prc + "\t" + ",".join(rec)
				print(recp)

		res = (yac, ypr)
		return res

	def __toIntClabel(self, labels):
		"""
		converts string class label to int

		Parameters
			labels : class label values
		"""
		return list(map(lambda l : self.clabels.index(l), labels))

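A short, hedged driver sketch for the class above. The properties file name and its contents are hypothetical; the keys it must set (common.states, train.data.file, validate.data.file and so on) are the ones declared in defValues in the constructor.

#illustrative driver, not part of the original file
clf = MarkovChainClassifier("mcclf.properties")
clf.train()		#builds per class transition matrices
clf.validate()	#prints perf score using valid.accuracy.metric
preds = clf.predict()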
supv/nlm.py ADDED
@@ -0,0 +1,434 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

import os
import sys
import math
from random import randint
import random
import time
from datetime import datetime
import re, string, unicodedata
import spacy
import torch
from collections import defaultdict
import pickle
import numpy as np
from sentence_transformers import CrossEncoder

sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *

"""
neural language model
"""

class NeuralLangModel(object):
	def __init__(self):
		"""
		initialize
		"""
		self.dexts = None

	def loadDocs(self, fpaths):
		"""
		loads documents from one file, all files under a directory or a list of files
		"""
		fPaths = fpaths.split(",")
		if len(fPaths) == 1:
			if os.path.isfile(fPaths[0]):
				#one file
				print("got one file from path")
				dnames = fPaths
				docStr = getOneFileContent(fPaths[0])
				dtexts = [docStr]
			else:
				#all files under directory
				print("got all files under directory from path")
				dtexts, dnames = getFileContent(fPaths[0])
				print("found following files")
				for dt, dn in zip(dtexts, dnames):
					print(dn + "\t" + dt[:40])
		else:
			#list of files
			print("got list of files from path")
			dnames = fPaths
			dtexts = list(map(getOneFileContent, fPaths))

		ndocs = (dtexts, dnames)
		return ndocs

#encoded doc
class EncodedDoc:
	def __init__(self, dtext, dname, drank=None):
		"""
		initialize
		"""
		self.dtext = dtext
		self.dname = dname
		self.drank = drank
		self.denc = None
		self.score = None

	def encode(self, nlp):
		"""
		encode
		"""
		self.denc = nlp(self.dtext)

#similarity at token and sentence level for BERT encoding
class SemanticSearch:
	def __init__(self, docs=None):
		"""
		initialize
		"""
		print("loading BERT transformer model")
		self.nlp = spacy.load("en_trf_bertbaseuncased_lg")
		self.docs = docs if docs is not None else list()

	def docAv(self, qu, doc):
		"""
		whole doc similarity (ds)
		"""
		return qu.similarity(doc)

	def tokSimAv(self, qu, doc):
		"""
		token pair wise average (tsa)
		"""
		qts = self.simAll(self.__getTensor(qu), self.__getTensor(doc))
		asi = np.mean(qts)
		return asi

	def tokSimMed(self, qu, doc):
		"""
		token pair wise median (tsme)
		"""
		qts = self.simAll(self.__getTensor(qu), self.__getTensor(doc))
		asi = np.median(qts)
		return asi

	def tokSimMax(self, qu, doc):
		"""
		token pair wise max (tsma)
		"""
		qte = self.__getTensor(qu)
		dte = self.__getTensor(doc)
		return self.simMax(qte, dte)

	def tokSimAvMax(self, qu, doc):
		"""
		token max then average (tsavm)
		"""
		qte = self.__getTensor(qu)
		dte = self.__getTensor(doc)
		return self.simAvMax(qte, dte)

	def tokSimMaxAv(self, qu, doc):
		"""
		token average and then max (tsmav)
		"""
		qte = self.__getTensor(qu)
		dte = self.__getTensor(doc)
		return self.simMaxAv(qte, dte)

	def sentSimAv(self, qu, doc):
		"""
		sentence wise average (ssa)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		sims = self.simAll(qse, dse)
		return np.mean(sims)

	def sentSimMed(self, qu, doc):
		"""
		sentence wise median (ssme)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		sims = self.simAll(qse, dse)
		return np.median(sims)

	def sentSimMax(self, qu, doc):
		"""
		sentence wise max (ssma)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		sims = self.simAll(qse, dse)
		return np.max(sims)


	def sentSimAvMax(self, qu, doc):
		"""
		sentence max then average (ssavm)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		return self.simAvMax(qse, dse)

	def sentSimMaxAv(self, qu, doc):
		"""
		sentence average and then max (ssmav)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		return self.simMaxAv(qse, dse)

	def simMax(self, qte, dte):
		"""
		max similarity between 2 sets of elements
		"""
		msi = 0
		for qt in qte:
			for dt in dte:
				si = cosineSimilarity(qt, dt)
				if not math.isnan(si) and si > msi:
					msi = si
		return msi

	def simAvMax(self, qte, dte):
		"""
		max then average
		"""
		qts = list()
		for qt in qte:
			msi = 0
			for dt in dte:
				si = cosineSimilarity(qt, dt)
				if not math.isnan(si) and si > msi:
					msi = si
			qts.append(msi)

		amsi = np.mean(np.array(qts))
		return amsi

	def simMaxAv(self, lqe, lde):
		"""
		average and then max
		"""
		masi = 0
		for qe in lqe:
			qes = list()
			for de in lde:
				si = cosineSimilarity(qe, de)
				if not math.isnan(si):
					qes.append(si)
			av = np.mean(np.array(qes))
			if av > masi:
				masi = av
		return masi

	def simAll(self, lqe, lde):
		"""
		all pair wise similarities
		"""
		qes = list()
		for qe in lqe:
			for de in lde:
				si = cosineSimilarity(qe, de)
				if not math.isnan(si):
					qes.append(si)
		return np.array(qes)

	def __sentEnc(self, qu, doc):
		"""
		sentence encoding for query and doc
		"""
		qstr = qu._.trf_word_pieces_
		qte = zip(qstr, qu._.trf_last_hidden_state)
		qse = list()
		for t, v in qte:
			if t == "[CLS]":
				qse.append(v)


		dstr = doc._.trf_word_pieces_
		dte = zip(dstr, doc._.trf_last_hidden_state)
		dse = list()
		for t, v in dte:
			if t == "[CLS]":
				dse.append(v)

		enp = (np.array(qse), np.array(dse))
		return enp

	def __getTensor(self, toks):
		"""
		tensors from tokens
		"""
		return list(map(lambda t: t.tensor, toks))

	def addDocs(self, docs):
		"""
		add named doc content
		"""
		self.docs.extend(docs)

	def loadDocs(self, fpaths):
		"""
		loads documents from one file, all files under a directory or a list of files
		"""
		fPaths = fpaths.split(",")
		if len(fPaths) == 1:
			if os.path.isfile(fPaths[0]):
				#one file
				print("one file")
				dnames = fPaths
				docStr = getOneFileContent(fPaths[0])
				dtexts = [docStr]
			else:
				#all files under directory
				print("all files under directory")
				dtexts, dnames = getFileContent(fPaths[0])
				print("found following files")
				for dt, dn in zip(dtexts, dnames):
					print(dn + "\t" + dt[:40])
		else:
			#list of files
			print("list of files")
			dnames = fPaths
			dtexts = list(map(getOneFileContent, fPaths))

		docs = list(map(lambda dt : EncodedDoc(dt[0], dt[1]), zip(dtexts, dnames)))
		self.docs.extend(docs)

	def search(self, qstr, algo, gdranks=None):
		"""
		searches all documents, scoring each against the query
		"""
		qv = self.nlp(qstr)
		res = list()
		for d in self.docs:
			dn = d.dname
			if d.denc is None:
				d.encode(self.nlp)
			dv = d.denc
			if algo == "ds":
				si = self.docAv(qv, dv)
			elif algo == "tsa":
				si = self.tokSimAv(qv, dv)
			elif algo == "tsme":
				si = self.tokSimMed(qv, dv)
			elif algo == "tsma":
				si = self.tokSimMax(qv, dv)
			elif algo == "tsavm":
				si = self.tokSimAvMax(qv, dv)
			elif algo == "tsmav":
				si = self.tokSimMaxAv(qv, dv)
			elif algo == "ssa":
				si = self.sentSimAv(qv, dv)
			elif algo == "ssme":
				si = self.sentSimMed(qv, dv)
			elif algo == "ssma":
				si = self.sentSimMax(qv, dv)
			elif algo == "ssavm":
				si = self.sentSimAvMax(qv, dv)
			elif algo == "ssmav":
				si = self.sentSimMaxAv(qv, dv)
			else:
				si = -1.0
				print("invalid similarity algo")

			#print("{} score {:.6f}".format(dn, si))
			d.score = si
			r = (dn, si)
			res.append(r)

		#search score for each document
		res.sort(key=lambda r : r[1], reverse=True)
		print("\nsorted search result")
		print("query: {} matching algo: {}".format(qstr, algo))
		for r in res:
			print("{} score {:.3f}".format(r[0], r[1]))

		#rank order if gold truth rank provided
		if gdranks is not None:
			i = 0
			count = 0
			for d in gdranks:
				while i < len(gdranks):
					if d == res[i][0]:
						count += 1
						i += 1
						break
					i += 1
			ro = count / len(gdranks)
			print("rank order {:.3f}".format(ro))

+
371
+ #similarity at passage or paragraph level using sbertcross encoder
372
+ class SemanticSimilaityCrossEnc(NeuralLangModel):
373
+
374
+ def __init__(self, docs=None):
375
+ self.dparas = None
376
+ self.scores = None
377
+ print("loading cross encoder")
378
+ self.model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2")
379
+ print("done loading cross encoder")
380
+ super(NeuralLangModel, self).__init__()
381
+
382
+ def paraSimilarity(self, dtext, fpaths, minParNl=1):
383
+ """
384
+ returns paragarph pair similarity across 2 documents
385
+ """
386
+ dtexts, dnames = self.loadDocs(fpaths)
387
+ if dtext is None:
388
+ assertEqual(len(dtexts), 2, "exactly 2 files needed")
389
+ self.dtexts = dtexts
390
+ else:
391
+ assertEqual(len(dtexts), 1, "exactly 1 file needed")
392
+ self.dtexts = list()
393
+ self.dtexts.append(dtext)
394
+ self.dtexts.append(dtexts[0])
395
+
396
+
397
+ self.dparas = list()
398
+ for text in self.dtexts:
399
+ regx = "\n+" if minParNl == 1 else "\n{2,}"
400
+ paras = re.split(regx, text.replace("\r\n", "\n"))
401
+ print("no of paras {}".format(len(paras)))
402
+ self.dparas.append(paras)
403
+
404
+ tinp = list()
405
+ for para1 in self.dparas[0]:
406
+ inp = list(map(lambda para2: [para1, para2], self.dparas[1]))
407
+ tinp.extend(inp)
408
+
409
+ print("input shape " + str(np.array(tinp).shape))
410
+ scores = self.model.predict(tinp)
411
+ print("score shape " + str(np.array(scores).shape))
412
+ #assertEqual(len(scores), len(self.dparas[0]) * len(self.dparas[1]), "no of scores don't match no of paragraph pairs")
413
+ print(scores)
414
+
415
+ i = 0
416
+ print("text paragraph pair wise similarity")
417
+ for para1 in self.dparas[0]:
418
+ for para2 in self.dparas[1]:
419
+ print("first: {}\t second: {}\t score: {:.6f}".format(para1[:20], para2[:20], scores[i]))
420
+ i += 1
421
+
422
+ self.scores = scores
423
+
424
+ def avMaxScore(self):
425
+ """
426
+ """
427
+ pass
428
+
429
+ def ner(text, nlp):
430
+ #nlp = spacy.load("en_core_web_md")
431
+ doc = nlp(text)
432
+ for ent in doc.ents:
433
+ print(ent.text, ent.start_char, ent.end_char, ent.label_)
434
+
supv/optunar.py ADDED
@@ -0,0 +1,127 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import torch
from torch.utils.data import DataLoader
import random
import jprops
from random import randint
import optuna
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import FeedForwardNetwork

"""
neural network hyperparameter tuning with optuna
"""

def createTunerConfig(configFile):
	"""
	create tuner config object
	"""
	defValues = dict()
	defValues["train.num.layers"] = ([2,4], None)
	defValues["train.num.units"] = (None, "missing range of number of units")
	defValues["train.activation"] = ("relu", None)
	defValues["train.batch.normalize"] = (["true", "false"], None)
	defValues["train.dropout.prob"] = ([-0.1, 0.5], None)
	defValues["train.out.num.units"] = (None, "missing number of output units")
	defValues["train.out.activation"] = (None, "missing output activation")
	defValues["train.batch.size"] = ([16, 128], None)
	defValues["train.opt.learning.rate"] = ([.0001, .005], None)

	config = Configuration(configFile, defValues)
	return config

def showStudyResults(study):
	"""
	shows study results
	"""
	print("Number of finished trials: ", len(study.trials))
	print("Best trial:")
	trial = study.best_trial
	print("Value: ", trial.value)
	print("Params: ")
	for key, value in trial.params.items():
		print("    {}: {}".format(key, value))


def objective(trial, networkType, modelConfigFile, tunerConfigFile):
	"""
	optuna based hyperparameter tuning for neural network
	"""
	tConfig = createTunerConfig(tunerConfigFile)

	#tuning parameters
	nlayers = tConfig.getIntListConfig("train.num.layers")[0]
	nunits = tConfig.getIntListConfig("train.num.units")[0]
	act = tConfig.getStringConfig("train.activation")[0]
	dropOutRange = tConfig.getFloatListConfig("train.dropout.prob")[0]
	outNunits = tConfig.getIntConfig("train.out.num.units")[0]
	outAct = tConfig.getStringConfig("train.out.activation")[0]
	batchSizes = tConfig.getIntListConfig("train.batch.size")[0]
	learningRates = tConfig.getFloatListConfig("train.opt.learning.rate")[0]

	numLayers = trial.suggest_int("numLayers", nlayers[0], nlayers[1])

	#batch normalize on for all layers or none
	batchNormOptions = ["true", "false"]
	batchNorm = trial.suggest_categorical("batchNorm", batchNormOptions)

	layerConfig = ""
	maxUnits = nunits[1]
	sep = ":"
	for i in range(numLayers):
		if i < numLayers - 1:
			nunit = trial.suggest_int("numUnits_l{}".format(i), nunits[0], maxUnits)
			dropOut = trial.suggest_float("dropOut_l{}".format(i), dropOutRange[0], dropOutRange[1])
			lconfig = [str(nunit), act, batchNorm, "true", "{:.3f}".format(dropOut)]
			lconfig = sep.join(lconfig) + ","
			maxUnits = nunit
		else:
			lconfig = [str(outNunits), outAct, "false", "false", "{:.3f}".format(-0.1)]
			lconfig = sep.join(lconfig)
		layerConfig = layerConfig + lconfig

	batchSize = trial.suggest_int("batchSize", batchSizes[0], batchSizes[1])
	learningRate = trial.suggest_float("learningRate", learningRates[0], learningRates[1])

	#train model
	nnModel = FeedForwardNetwork(modelConfigFile)
	nnModel.setConfigParam("train.layer.data", layerConfig)
	nnModel.setConfigParam("train.batch.size", batchSize)
	nnModel.setConfigParam("train.opt.learning.rate", learningRate)
	nnModel.buildModel()
	score = FeedForwardNetwork.batchTrain(nnModel)
	return score

if __name__ == "__main__":
	assert len(sys.argv) == 5, "requires 4 command line args"

	networkType = sys.argv[1]
	modelConfigFile = sys.argv[2]
	tunerConfigFile = sys.argv[3]
	numTrial = int(sys.argv[4])

	study = optuna.create_study()
	study.optimize(lambda trial: objective(trial, networkType, modelConfigFile, tunerConfigFile), n_trials=numTrial)

	showStudyResults(study)

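For clarity, a hedged sketch of what one sampled trial feeds into train.layer.data given the defaults above. The exact field semantics belong to FeedForwardNetwork's layer spec; the values below are illustrative only, with numLayers=3 and batchNorm="true", hidden sizes shrinking because maxUnits is lowered to the previous layer's width, and the final layer carrying the -0.100 no-dropout sentinel.

#hypothetical value produced by one trial
#"96:relu:true:true:0.350,48:relu:true:true:0.120,1:sigmoid:false:false:-0.100"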
supv/pasearch.py ADDED
@@ -0,0 +1,243 @@
#!/Users/pranab/Tools/anaconda/bin/python

# Package imports
import os
import sys
import numpy as np
import sklearn as sk
import random
import jprops
import abc
import math
sys.path.append(os.path.abspath("../lib"))
from util import *

#base parameter search
class BaseParameterSearch(object):
	__metaclass__ = abc.ABCMeta

	def __init__(self, verbose):
		self.verbose = verbose
		self.parameters = []
		self.paramData = {}
		self.currentParams = []
		self.curIter = 0
		self.bestSolution = None

	# add param name and type
	def addParam(self, param):
		self.parameters.append(param)

	# add param data
	def addParamVaues(self, paramName, paramData):
		self.paramData[paramName] = paramData

	# max iterations
	def setMaxIter(self, maxIter):
		self.maxIter = maxIter

	@abc.abstractmethod
	def prepare(self):
		pass

	@abc.abstractmethod
	def nextParamValues(self):
		pass

	@abc.abstractmethod
	def setCost(self, cost):
		pass

	# get best solution
	def getBestSolution(self):
		return self.bestSolution

#enumerate through provided list of param values
class GuidedParameterSearch:
	def __init__(self, verbose=False):
		self.verbose = verbose
		self.parameters = []
		self.paramData = {}
		self.paramIndexes = []
		self.numParamValues = []
		self.currentParams = []
		self.bestSolution = None

	# max iterations
	def setMaxIter(self,maxIter):
		self.maxIter = maxIter

	# add param name and type
	def addParam(self, param):
		self.parameters.append(param)

	# add param data
	def addParamVaues(self, paramName, paramData):
		self.paramData[paramName] = paramData

	# prepare
	def prepare(self):
		self.numParams = len(self.parameters)
		for i in range(self.numParams):
			self.paramIndexes.append(0)

			#number of values for each parameter
			paramName = self.parameters[i][0]
			self.numParamValues.append(len(self.paramData[paramName]))
		self.curParamIndex = 0

		paramValueCombList = []
		paramValueComb = []
		paramValueCombList.append(paramValueComb)

		# all params
		for i in range(self.numParams):
			paramValueCombListTemp = []
			for paramValueComb in paramValueCombList:
				# all param values
				for j in range(self.numParamValues[i]):
					paramValueCombTemp = paramValueComb[:]
					paramValueCombTemp.append(j)
					paramValueCombListTemp.append(paramValueCombTemp)
			paramValueCombList = paramValueCombListTemp
		self.paramValueCombList = paramValueCombList
		self.numParamValueComb = len(self.paramValueCombList)
		self.curParamValueCombIndx = 0

	# next param combination
	def nextParamValues(self):
		retParamNameValue = None
		if self.curParamValueCombIndx < len(self.paramValueCombList):
			retParamNameValue = []
			curParams = self.paramValueCombList[self.curParamValueCombIndx]
			print (curParams)
			for i in range(len(curParams)):
				paramName = self.parameters[i][0]
				paramValue = self.paramData[paramName][curParams[i]]
				retParamNameValue.append((paramName, paramValue))
			self.curParamValueCombIndx = self.curParamValueCombIndx + 1
			self.currentParams = retParamNameValue
		return retParamNameValue

	# set cost of current parameter set
	def setCost(self, cost):
		if self.bestSolution is not None:
			if cost < self.bestSolution[1]:
				self.bestSolution = (self.currentParams, cost)
		else:
			self.bestSolution = (self.currentParams, cost)

	# get best solution
	def getBestSolution(self):
		return self.bestSolution

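#A small hedged driver for the grid enumeration above (parameter names, values
#and the cost function are made up; the nextParamValues/setCost loop protocol
#is as defined in the class):
#
#	gs = GuidedParameterSearch()
#	gs.addParam(("maxDepth", "int"))
#	gs.addParamVaues("maxDepth", [3, 5, 7])
#	gs.addParam(("minSamplesLeaf", "int"))
#	gs.addParamVaues("minSamplesLeaf", [2, 4])
#	gs.prepare()	#builds the 3 x 2 cartesian product of value indexes
#	pv = gs.nextParamValues()
#	while pv is not None:
#		cost = trainAndScore(pv)	#hypothetical model training returning error
#		gs.setCost(cost)
#		pv = gs.nextParamValues()
#	print(gs.getBestSolution())
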
#random search through provided list of parameter values
class RandomParameterSearch(BaseParameterSearch):
	def __init__(self, verbose=False):
		super(RandomParameterSearch, self).__init__(verbose)


	# prepare
	def prepare(self):
		pass

	# next param combination
	def nextParamValues(self):
		retParamNameValue = None
		if (self.curIter < self.maxIter):
			retParamNameValue = []
			for pName, pValues in self.paramData.items():
				pValue = selectRandomFromList(pValues)
				retParamNameValue.append((pName, pValue))
			self.curIter = self.curIter + 1
			self.currentParams = retParamNameValue
		return retParamNameValue

	# set cost of current parameter set
	def setCost(self, cost):
		if self.bestSolution is not None:
			if cost < self.bestSolution[1]:
				self.bestSolution = (self.currentParams, cost)
		else:
			self.bestSolution = (self.currentParams, cost)

#simulated annealing search through provided list of parameter values
class SimulatedAnnealingParameterSearch(BaseParameterSearch):
	def __init__(self, verbose=False):
		self.curSolution = None
		self.nextSolution = None
		super(SimulatedAnnealingParameterSearch, self).__init__(verbose)

	# prepare
	def prepare(self):
		pass

	def setTemp(self, temp):
		self.temp = temp

	def setTempReductionRate(self, tempRedRate):
		self.tempRedRate = tempRedRate

	# next param combination
	def nextParamValues(self):
		retParamNameValue = None
		if (self.curIter == 0):
			#initial random solution
			retParamNameValue = []
			for pName, pValues in self.paramData.items():
				pValue = selectRandomFromList(pValues)
				retParamNameValue.append((pName, pValue))
			self.curIter = self.curIter + 1
			self.currentParams = retParamNameValue
		elif (self.curIter < self.maxIter):
			#perturb current solution
			retParamNameValue = []

			#randomly mutate one parameter value
			(pNameSel, pValue) = selectRandomFromList(self.currentParams)
			pValueNext = selectRandomFromList(self.paramData[pNameSel])
			while (pValueNext == pValue):
				pValueNext = selectRandomFromList(self.paramData[pNameSel])

			#copy
			for (pName, pValue) in self.currentParams:
				if (pName == pNameSel):
					pValueNew = pValueNext
				else:
					pValueNew = pValue
				retParamNameValue.append((pName, pValueNew))
			self.curIter = self.curIter + 1
			self.currentParams = retParamNameValue
		return retParamNameValue

	# set cost of current parameter set
	def setCost(self, cost):
		if self.curSolution is None:
			self.curSolution = (self.currentParams, cost)
			self.bestSolution = (self.currentParams, cost)
		else:
			self.nextSolution = (self.currentParams, cost)
			if (self.nextSolution[1] < self.curSolution[1]):
				if (self.verbose):
					print ("next soln better")
				self.curSolution = self.nextSolution
				if (self.nextSolution[1] < self.bestSolution[1]):
					if (self.verbose):
						print ("next soln better than best")
					self.bestSolution = self.nextSolution
			else:
				if (self.verbose):
					print ("next soln worse")
				pr = math.exp((self.curSolution[1] - self.nextSolution[1]) / self.temp)
				if (pr > random.random()):
					self.curSolution = self.nextSolution
					if (self.verbose):
						print ("next soln worse but accepted")
				else:
					if (self.verbose):
						print ("next soln worse and rejected")

		self.temp = self.temp * self.tempRedRate

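The Metropolis style acceptance rule used in setCost above, isolated as a tiny hedged sketch (the numbers are made up): a worse solution is accepted with probability exp(-deltaCost/temp), which shrinks as the temperature decays.

#standalone illustration, not part of the original file
import math, random

curCost, nextCost, temp = 0.20, 0.26, 0.5
#worse by 0.06: acceptance probability exp(-0.06/0.5), about 0.887
pr = math.exp((curCost - nextCost) / temp)
accept = pr > random.random()
print("acceptance prob {:.3f} accepted {}".format(pr, accept))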
supv/regress.py ADDED
@@ -0,0 +1,253 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from io import StringIO
from sklearn.model_selection import cross_val_score
import joblib
from random import randint
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *

class BaseRegressor(object):
	"""
	base regression class
	"""

	def __init__(self, configFile, defValues):
		"""
		initializer
		"""
		defValues["common.mode"] = ("train", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.scale.file.path"] = (None, "missing scale file path")
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.out.field"] = (None, "missing out field ordinal")

		self.config = Configuration(configFile, defValues)
		self.featData = None
		self.outData = None
		self.regressor = None
		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		self.mode = self.config.getStringConfig("common.mode")[0]
		logFilePath = self.config.getStringConfig("common.logging.file")[0]
		logLevName = self.config.getStringConfig("common.logging.level")[0]
		self.logger = createLogger(__name__, logFilePath, logLevName)
		self.logger.info("********* starting session")

	def initConfig(self, configFile, defValues):
		"""
		initialize config
		"""
		self.config = Configuration(configFile, defValues)

	def getConfig(self):
		"""
		get config object
		"""
		return self.config

	def setConfigParam(self, name, value):
		"""
		set config param
		"""
		self.config.setParam(name, value)

	def getMode(self):
		"""
		get mode
		"""
		return self.mode

	def train(self):
		"""
		train model
		"""
		#build model
		self.buildModel()

		# training data
		if self.featData is None:
			(featData, outData) = self.prepData("train")
			(self.featData, self.outData) = (featData, outData)
		else:
			(featData, outData) = (self.featData, self.outData)

		# parameters
		modelSave = self.config.getBooleanConfig("train.model.save")[0]

		#train
		self.logger.info("...training model")
		self.regressor.fit(featData, outData)
		rsqScore = self.regressor.score(featData, outData)
		coef = self.regressor.coef_
		intc = self.regressor.intercept_
		result = (rsqScore, intc, coef)

		if modelSave:
			self.logger.info("...saving model")
			modelFilePath = self.getModelFilePath()
			joblib.dump(self.regressor, modelFilePath)
		return result

	def validate(self):
		"""
		validate using model
		"""
		# create model
		self.prepModel()

		# prepare test data
		(featData, outDataActual) = self.prepData("validate")

		#predict
		self.logger.info("...predicting")
		outDataPred = self.regressor.predict(featData)

		#error
		rsqScore = self.regressor.score(featData, outDataActual)
		result = (outDataPred, rsqScore)
		return result

	def predict(self):
		"""
		predict using trained model
		"""
		# create model
		self.prepModel()

		# prepare test data
		featData = self.prepData("predict")[0]

		#predict
		self.logger.info("...predicting")
		outData = self.regressor.predict(featData)
		return outData

	def prepData(self, mode):
		"""
		loads and prepares data for training and validation
		"""
		# parameters
		key = mode + ".data.file"
		dataFile = self.config.getStringConfig(key)[0]

		key = mode + ".data.fields"
		fieldIndices = self.config.getStringConfig(key)[0]
		if not fieldIndices is None:
			fieldIndices = strToIntArray(fieldIndices, ",")


		key = mode + ".data.feature.fields"
		featFieldIndices = self.config.getStringConfig(key)[0]
		if not featFieldIndices is None:
			featFieldIndices = strToIntArray(featFieldIndices, ",")

		if not mode == "predict":
			key = mode + ".data.out.field"
			outFieldIndex = self.config.getIntConfig(key)[0]

		#load data
		(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
		if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
			featData = sk.preprocessing.scale(featData)
		outData = None
		if not mode == "predict":
			outData = extrColumns(data, outFieldIndex)
		return (featData, outData)

	def prepModel(self):
		"""
		load saved model or train model
		"""
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if (useSavedModel and not self.regressor):
			# load saved model
			self.logger.info("...loading saved model")
			modelFilePath = self.getModelFilePath()
			self.regressor = joblib.load(modelFilePath)
		else:
			# train model
			self.train()

class LinearRegressor(BaseRegressor):
	"""
	linear regression
	"""
	def __init__(self, configFile):
		defValues = {}
		defValues["train.normalize"] = (False, None)

		super(LinearRegressor, self).__init__(configFile, defValues)

	def buildModel(self):
		"""
		builds model object
		"""
		self.logger.info("...building linear regression model")
		normalize = self.config.getBooleanConfig("train.normalize")[0]
		self.regressor = LinearRegression(normalize=normalize)

class ElasticNetRegressor(BaseRegressor):
	"""
	elastic net regression
	"""
	def __init__(self, configFile):
		defValues = {}
		defValues["train.alpha"] = (1.0, None)
		defValues["train.loneratio"] = (0.5, None)
		defValues["train.normalize"] = (False, None)
		defValues["train.precompute"] = (False, None)
		defValues["train.max.iter"] = (1000, None)
		defValues["train.tol"] = (0.0001, None)
		defValues["train.random.state"] = (None, None)
		defValues["train.selection"] = ("cyclic", None)

		super(ElasticNetRegressor, self).__init__(configFile, defValues)

	def buildModel(self):
		"""
		builds model object
		"""
		self.logger.info("...building elastic net regression model")
		alpha = self.config.getFloatConfig("train.alpha")[0]
		loneratio = self.config.getFloatConfig("train.loneratio")[0]
		normalize = self.config.getBooleanConfig("train.normalize")[0]
		precompute = self.config.getBooleanConfig("train.precompute")[0]
		maxIter = self.config.getIntConfig("train.max.iter")[0]
		tol = self.config.getFloatConfig("train.tol")[0]
		randState = self.config.getIntConfig("train.random.state")[0]
		selection = self.config.getStringConfig("train.selection")[0]

		self.regressor = ElasticNet(alpha=alpha, l1_ratio=loneratio, normalize=normalize, precompute=precompute,
			max_iter=maxIter, tol=tol, random_state=randState, selection=selection)

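A hedged driver sketch for the regressors above. The properties file name is hypothetical, and getModelFilePath is assumed to be supplied by shared base or utility code, since it is called here but not defined in this file.

#illustrative driver, not part of the original file
regr = LinearRegressor("lreg.properties")
rsq, intercept, coefs = regr.train()
preds, vrsq = regr.validate()
print("train R2 {:.3f} validation R2 {:.3f}".format(rsq, vrsq))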
supv/rf.py ADDED
@@ -0,0 +1,134 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from sklearn.ensemble import RandomForestClassifier
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *


# random forest classification
class RandomForest(BaseClassifier):
	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.validation"] = ("kfold", None)
		defValues["train.num.folds"] = (5, None)
		defValues["train.num.trees"] = (100, None)
		defValues["train.split.criterion"] = ("gini", None)
		defValues["train.max.depth"] = (None, None)
		defValues["train.min.samples.split"] = (4, None)
		defValues["train.min.samples.leaf"] = (2, None)
		defValues["train.min.weight.fraction.leaf"] = (0, None)
		defValues["train.max.features"] = ("auto", None)
		defValues["train.max.leaf.nodes"] = (None, None)
		defValues["train.min.impurity.decrease"] = (0, None)
		defValues["train.min.impurity.split"] = (1.0e-07, None)
		defValues["train.bootstrap"] = (True, None)
		defValues["train.oob.score"] = (False, None)
		defValues["train.num.jobs"] = (1, None)
		defValues["train.random.state"] = (None, None)
		defValues["train.verbose"] = (0, None)
		defValues["train.warm.start"] = (False, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["train.search.param.strategy"] = (None, None)
		defValues["train.search.params"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
		defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
		defValues["validate.data.class.field"] = (None, "missing class field ordinal")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["validate.score.method"] = ("accuracy", None)

		super(RandomForest, self).__init__(configFile, defValues, __name__)

	# builds model object
	def buildModel(self):
		self.logger.info("...building random forest model")
		numTrees = self.config.getIntConfig("train.num.trees")[0]
		splitCriterion = self.config.getStringConfig("train.split.criterion")[0]
		maxDepth = self.config.getStringConfig("train.max.depth")[0]
		maxDepth = typedValue(maxDepth)
		minSamplesSplit = self.config.getStringConfig("train.min.samples.split")[0]
		minSamplesSplit = typedValue(minSamplesSplit)
		minSamplesLeaf = self.config.getStringConfig("train.min.samples.leaf")[0]
		minSamplesLeaf = typedValue(minSamplesLeaf)
		minWeightFractionLeaf = self.config.getFloatConfig("train.min.weight.fraction.leaf")[0]
		maxFeatures = self.config.getStringConfig("train.max.features")[0]
		maxFeatures = typedValue(maxFeatures)
		maxLeafNodes = self.config.getIntConfig("train.max.leaf.nodes")[0]
		minImpurityDecrease = self.config.getFloatConfig("train.min.impurity.decrease")[0]
		bootstrap = self.config.getBooleanConfig("train.bootstrap")[0]
		oobScore = self.config.getBooleanConfig("train.oob.score")[0]
		numJobs = self.config.getIntConfig("train.num.jobs")[0]
		randomState = self.config.getIntConfig("train.random.state")[0]
		verbose = self.config.getIntConfig("train.verbose")[0]
		warmStart = self.config.getBooleanConfig("train.warm.start")[0]

		model = RandomForestClassifier(n_estimators=numTrees, criterion=splitCriterion, max_depth=maxDepth, \
			min_samples_split=minSamplesSplit, min_samples_leaf=minSamplesLeaf, min_weight_fraction_leaf=minWeightFractionLeaf, \
			max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, min_impurity_decrease=minImpurityDecrease, \
			min_impurity_split=None, bootstrap=bootstrap, oob_score=oobScore, n_jobs=numJobs, random_state=randomState, \
			verbose=verbose, warm_start=warmStart, class_weight=None)
		self.classifier = model
		return self.classifier

	#predict probability with in memory data
	def predictProb(self, recs):
		# create model
		self.prepModel()

		#input record
		if type(recs) is str:
			featData = self.prepStringPredictData(recs)
		else:
			featData = recs
		if (featData.ndim == 1):
			featData = featData.reshape(1, -1)

		#predict
		self.logger.info("...predicting class probability")
		clsData = self.classifier.predict_proba(featData)
		return clsData

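A hedged usage sketch. The properties file is hypothetical, and prepModel / prepStringPredictData are assumed to come from BaseClassifier in bacl.py, since they are called here but defined there.

#illustrative driver, not part of the original file
rf = RandomForest("rf.properties")
#single comma separated feature record; returns probability per class
probs = rf.predictProb("5.1,3.5,1.4,0.2")
print(probs)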
supv/svm.py ADDED
@@ -0,0 +1,141 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.linear_model
import sklearn.svm
import matplotlib
import random
import jprops
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *

# support vector machine classification
class SupportVectorMachine(BaseClassifier):

	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("train", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.scale.file.path"] = (None, "missing scale file path")
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.validation"] = ("kfold", None)
		defValues["train.num.folds"] = (5, None)
		defValues["train.algorithm"] = ("svc", None)
		defValues["train.kernel.function"] = ("rbf", None)
		defValues["train.poly.degree"] = (3, None)
		defValues["train.penalty"] = (1.0, None)
		defValues["train.gamma"] = ("scale", None)
		defValues["train.penalty.norm"] = ("l2", None)
		defValues["train.loss"] = ("squared_hinge", None)
		defValues["train.dual"] = (True, None)
		defValues["train.shrinking"] = (True, None)
		defValues["train.nu"] = (0.5, None)
		defValues["train.predict.probability"] = (False, None)
		defValues["train.print.sup.vectors"] = (False, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["train.search.param.strategy"] = (None, None)
		defValues["train.search.params"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
		defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
		defValues["validate.data.class.field"] = (None, "missing class field ordinal")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["validate.score.method"] = ("accuracy", None)

		super(SupportVectorMachine, self).__init__(configFile, defValues, __name__)

	# builds model object
	def buildModel(self):
		self.logger.info("...building svm model")
		algo = self.config.getStringConfig("train.algorithm")[0]
		kernelFun = self.config.getStringConfig("train.kernel.function")[0]
		penalty = self.config.getFloatConfig("train.penalty")[0]
		polyDegree = self.config.getIntConfig("train.poly.degree")[0]
		kernelCoeff = self.config.getStringConfig("train.gamma")[0]
		kernelCoeff = typedValue(kernelCoeff)
		penaltyNorm = self.config.getStringConfig("train.penalty.norm")[0]
		trainLoss = self.config.getStringConfig("train.loss")[0]
		dualOpt = self.config.getBooleanConfig("train.dual")[0]
		shrinkHeuristic = self.config.getBooleanConfig("train.shrinking")[0]
		predictProb = self.config.getBooleanConfig("train.predict.probability")[0]
		supVecBound = self.config.getFloatConfig("train.nu")[0]

		if (algo == "svc"):
			if kernelFun == "poly":
				model = sk.svm.SVC(C=penalty, kernel=kernelFun, degree=polyDegree, gamma=kernelCoeff, shrinking=shrinkHeuristic, \
					probability=predictProb)
			elif kernelFun == "rbf" or kernelFun == "sigmoid":
				model = sk.svm.SVC(C=penalty, kernel=kernelFun, gamma=kernelCoeff, shrinking=shrinkHeuristic, probability=predictProb)
			else:
				model = sk.svm.SVC(C=penalty, kernel=kernelFun, shrinking=shrinkHeuristic, probability=predictProb)
		elif (algo == "nusvc"):
			if kernelFun == "poly":
				model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, degree=polyDegree, gamma=kernelCoeff, shrinking=shrinkHeuristic, \
					probability=predictProb)
			elif kernelFun == "rbf" or kernelFun == "sigmoid":
				model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, gamma=kernelCoeff, shrinking=shrinkHeuristic, probability=predictProb)
			else:
				model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, shrinking=shrinkHeuristic, probability=predictProb)
		elif (algo == "linearsvc"):
			model = sk.svm.LinearSVC(penalty=penaltyNorm, loss=trainLoss, dual=dualOpt)
		else:
			self.logger.info("invalid svm algorithm")
			sys.exit()
		self.classifier = model
		return self.classifier

	#predict probability with in memory data
	def predictProb(self, recs):
		# create model
		self.prepModel()

		#input record
		if type(recs) is str:
			featData = self.prepStringPredictData(recs)
		else:
			featData = recs
		if (featData.ndim == 1):
			featData = featData.reshape(1, -1)

		#predict
		self.logger.info("...predicting class probability")
		clsData = self.classifier.predict_proba(featData)
		return clsData


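A hedged usage sketch. The properties file is hypothetical; note that predict_proba is only available for the svc and nusvc algorithms, and only when the config sets train.predict.probability to True so the model is fit with probability estimates.

#illustrative driver, not part of the original file
svm = SupportVectorMachine("svm.properties")
svm.buildModel()
probs = svm.predictProb("5.1,3.5,1.4,0.2")	#per class probabilities
print(probs)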
supv/svml.py ADDED
@@ -0,0 +1,428 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.svm
import sklearn.preprocessing
import matplotlib
import random
import jprops
import joblib
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from random import randint

if len(sys.argv) < 2:
	print("usage: ./svml.py <config_properties_file>")
	sys.exit()

#train by bagging
def train_bagging():
	model = build_model()
	bagging_model = BaggingClassifier(base_estimator=model, n_estimators=bagging_num_estimator,
		max_samples=bagging_sample_fraction, oob_score=bagging_use_oob)

	#train model
	bagging_model.fit(XC, yc)

	#persist models
	if persist_model:
		models = bagging_model.estimators_
		for m in zip(range(0, len(models)), models):
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(m[0] + 1) + ".mod"
			joblib.dump(m[1], model_file)

	score = bagging_model.score(XC, yc)
	print("average error %.3f" % (1.0 - score))

#linear k fold validation
def train_kfold_validation(nfold):
	if native_kfold_validation:
		print("native linear kfold validation")
		model = build_model()
		scores = cross_val_score(model, XC, yc, cv=nfold)
		av_score = np.mean(scores)
		print("average error %.3f" % (1.0 - av_score))
	else:
		print("extended linear kfold validation")
		train_kfold_validation_ext(nfold)

#linear k fold validation
def train_kfold_validation_ext(nfold):
	model = build_model()
	#scores = cross_val_score(model, XC, yc, cv=nfold)
	#print(scores)

	offset = 0
	length = dsize // nfold
	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0, nfold):
		print("....Next fold %d" % (i))

		#split data
		(XV, yv, X, y) = split_data(offset, length)
		dvsize = len(XV)

		#train model
		model.fit(X, y)

		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			joblib.dump(model, model_file)

		#print support vectors
		print_support_vectors(model)

		#predict
		print("making predictions...")
		yp = model.predict(XV)

		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize, yv, yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)

		offset += length

	#average error
	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))

# random k fold validation
def train_rfold_validation(nfold, niter):
	if native_rfold_validation:
		print("native random kfold validation")
		test_fraction = 1.0 / nfold
		scores = []
		for i in range(0, niter):
			state = randint(1, 100)
			X, XV, y, yv = train_test_split(XC, yc, test_size=test_fraction, random_state=state)
			model = build_model()
			model.fit(X, y)
			scores.append(model.score(XV, yv))

		print(scores)
		av_score = np.mean(scores)
		print("average error %.3f" % (1.0 - av_score))

	else:
		print("extended random kfold validation")
		train_rfold_validation_ext(nfold, niter)

# random k fold validation
def train_rfold_validation_ext(nfold, niter):
	max_offset_frac = 1.0 - 1.0 / nfold
	max_offset_frac -= .01
	length = dsize // nfold

	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0, niter):
		print("...Next iteration %d" % (i))
		offset = int(dsize * random.random() * max_offset_frac)
		print("offset: %d length: %d" % (offset, length))
		(XV, yv, X, y) = split_data(offset, length)
		dvsize = len(XV)

		#build model
		model = build_model()

		#train model
		model.fit(X, y)

		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			print("saving model file " + model_file)
			joblib.dump(model, model_file)

		#print support vectors
		print_support_vectors(model)

		#predict
		print("making predictions...")
		yp = model.predict(XV)

		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize, yv, yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)

	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))

# make predictions with all persisted models and take a majority vote
def predict():
	psize = len(X)
	class_counts = []

	#all models
	for i in range(0, num_models):
		model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
		print("loading model file " + model_file)
		model = joblib.load(model_file)

		yp = model.predict(X)
		if i == 0:
			#initialize class counts
			for y in yp:
				class_count = {}
				if y == 0:
					class_count[0] = 1
					class_count[1] = 0
				else:
					class_count[1] = 1
					class_count[0] = 0
				class_counts.append(class_count)

		else:
			#increment class count
			for j in range(0, psize):
				class_count = class_counts[j]
				y = yp[j]
				class_count[y] += 1

	# predict based on majority vote
	print("here are the predictions")
	for k in range(0, psize):
		class_count = class_counts[k]
		if class_count[0] > class_count[1]:
			y = 0
			majority = class_count[0]
		else:
			y = 1
			majority = class_count[1]

		print(X[k])
		print("prediction %d majority count %d" % (y, majority))

#builds model
def build_model():
	#build model
	print("building model...")
	if algo == "svc":
		if kernel_fun == "poly":
			model = sk.svm.SVC(C=penalty, kernel=kernel_fun, degree=poly_degree, gamma=kernel_coeff)
		elif kernel_fun == "rbf" or kernel_fun == "sigmoid":
			model = sk.svm.SVC(C=penalty, kernel=kernel_fun, gamma=kernel_coeff)
		else:
			model = sk.svm.SVC(C=penalty, kernel=kernel_fun)
	elif algo == "nusvc":
		if kernel_fun == "poly":
			model = sk.svm.NuSVC(kernel=kernel_fun, degree=poly_degree, gamma=kernel_coeff)
		elif kernel_fun == "rbf" or kernel_fun == "sigmoid":
			model = sk.svm.NuSVC(kernel=kernel_fun, gamma=kernel_coeff)
		else:
			model = sk.svm.NuSVC(kernel=kernel_fun)
	elif algo == "linearsvc":
		model = sk.svm.LinearSVC()
	else:
		print("invalid svm algorithm")
		sys.exit()
	return model

#splits data into training and validation sets
def split_data(offset, length):
	print("splitting data...")
	#copy data
	XC_c = np.copy(XC)
	yc_c = list(yc)

	# validation set
	vlo = offset
	vup = vlo + length
	if vup > len(yc):
		vup = len(yc)
	XV = XC_c[vlo:vup:1]
	yv = yc_c[vlo:vup:1]
	dvsize = len(XV)
	print("data size %d validation data size %d" % (dsize, dvsize))
	#print("validation set")
	#print(XV)
	#print(yv)

	#training set
	X = np.delete(XC_c, np.s_[vlo:vup:1], 0)
	y = np.delete(yc_c, np.s_[vlo:vup:1], 0)
	#print("training set")
	#print(X)
	#print(y)
	return (XV, yv, X, y)

#print support vectors
def print_support_vectors(model):
	if algo != "linearsvc":
		if print_sup_vectors:
			print("showing support vectors...")
			print(model.support_vectors_)
			print("num of support vectors")
			print(model.n_support_)

#prints prediction output
def validate(dvsize, yv, yp):
	print("showing predictions...")
	err_count = 0
	tp = 0
	tn = 0
	fp = 0
	fn = 0
	for r in range(0, dvsize):
		#print("actual: %d predicted: %d" % (yv[r], yp[r]))
		if yv[r] != yp[r]:
			err_count += 1

		if yp[r] == 1 and yv[r] == 1:
			tp += 1
		elif yp[r] == 1 and yv[r] == 0:
			fp += 1
		elif yp[r] == 0 and yv[r] == 0:
			tn += 1
		else:
			fn += 1

	er = float(err_count) / dvsize
	fp_er = float(fp) / dvsize
	fn_er = float(fn) / dvsize
	print("error %.3f" % (er))
	print("true positive : %.3f" % (float(tp) / dvsize))
	print("false positive: %.3f" % (fp_er))
	print("true negative : %.3f" % (float(tn) / dvsize))
	print("false negative: %.3f" % (fn_er))

	return (er, fp_er, fn_er)

# load configuration
def getConfigs(configFile):
	configs = {}
	print("using following configurations")
	with open(configFile) as fp:
		for key, value in jprops.iter_properties(fp):
			print(key, value)
			configs[key] = value

	return configs


# load configuration
configs = getConfigs(sys.argv[1])
mode = configs["common.mode"]

if mode == "train":
	#train
	print("running in train mode")
	data_file = configs["train.data.file"]
	feat_field_indices = configs["train.data.feature.fields"].split(",")
	feat_field_indices = [int(a) for a in feat_field_indices]
	class_field_index = int(configs["train.data.class.field"])
	preprocess = configs["common.preprocessing"]
	validation = configs["train.validation"]
	num_folds = int(configs["train.num.folds"])
	num_iter = int(configs["train.num.iter"])
	algo = configs["train.algorithm"]
	kernel_fun = configs["train.kernel.function"]
	poly_degree = int(configs["train.poly.degree"])
	penalty = float(configs["train.penalty"])
	if penalty < 0:
		penalty = 1.0
		print("using default for penalty")
	kernel_coeff = float(configs["train.gamma"])
	if kernel_coeff < 0:
		kernel_coeff = 'auto'
		print("using default for gamma")
	print_sup_vectors = configs["train.print.sup.vectors"].lower() == "true"
	persist_model = configs["train.persist.model"].lower() == "true"
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]

	print(feat_field_indices)

	#extract feature fields
	d = np.loadtxt(data_file, delimiter=',')
	dsize = len(d)
	XC = d[:,feat_field_indices]

	#preprocess features
	if preprocess == "scale":
		XC = sk.preprocessing.scale(XC)
	elif preprocess == "normalize":
		XC = sk.preprocessing.normalize(XC, norm='l2')
	else:
		print("no preprocessing done")

	#extract output field
	yc = d[:,[class_field_index]]
	yc = yc.reshape(dsize)
	yc = [int(a) for a in yc]

	#print(XC)
	#print(yc)


	# train model
	if validation == "kfold":
		native_kfold_validation = configs["train.native.kfold.validation"].lower() == "true"
		train_kfold_validation(num_folds)
	elif validation == "rfold":
		native_rfold_validation = configs["train.native.rfold.validation"].lower() == "true"
		train_rfold_validation(num_folds, num_iter)
	elif validation == "bagging":
		bagging_num_estimator = int(configs["train.bagging.num.estimators"])
		bagging_sample_fraction = float(configs["train.bagging.sample.fraction"])
		#read the OOB flag from its own property instead of the sample fraction (bug fix; property name assumed)
		bagging_use_oob = configs["train.bagging.use.oob"].lower() == "true"
		train_bagging()
	else:
		print("invalid training validation method")
		sys.exit()

else:
	#predict
	print("running in prediction mode")
	pred_data_file = configs["pred.data.file"]
	pred_feat_field_indices = configs["pred.data.feature.fields"].split(",")
	pred_feat_field_indices = [int(a) for a in pred_feat_field_indices]
	preprocess = configs["common.preprocessing"]
	num_models = int(configs["pred.num.models"])
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]

	#extract feature fields
	pd = np.loadtxt(pred_data_file, delimiter=',')
	pdsize = len(pd)
	X = pd[:,pred_feat_field_indices]

	#preprocess features
	if preprocess == "scale":
		X = sk.preprocessing.scale(X)
	elif preprocess == "normalize":
		X = sk.preprocessing.normalize(X, norm='l2')
	else:
		print("no preprocessing done")

	predict()
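
For reference, the script above is driven by a jprops style properties file. Below is a minimal sketch for native k-fold training; every key is read in the code above, but the values are purely illustrative:

common.mode=train
common.preprocessing=scale
common.model.directory=model
common.model.file.prefix=svm
train.data.file=train.csv
train.data.feature.fields=0,1,2,3
train.data.class.field=4
train.validation=kfold
train.native.kfold.validation=true
train.num.folds=5
train.num.iter=1
train.algorithm=svc
train.kernel.function=rbf
train.poly.degree=3
train.penalty=1.0
train.gamma=-1.0
train.print.sup.vectors=false
train.persist.model=false

With train.penalty or train.gamma negative the script falls back to the library defaults. Setting common.mode to anything other than train switches to prediction mode, which additionally needs the pred.* keys read in the else branch.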
supv/tnn.py ADDED
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader
import sklearn as sk
from sklearn.neighbors import KDTree
import matplotlib
import random
import jprops
from random import randint
import statistics
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *

"""
forward hook function
"""
intermedOut = {}
lvalues = list()

def hookFn(m, i, o):
	"""
	call back that collects latent values from a layer's output
	"""
	#intermedOut[m] = o
	lv = o.data.cpu().numpy()
	lv = lv[0].tolist()
	lvalues.append(lv)
	#print(lv)

def getLatValues():
	"""
	returns collected latent values
	"""
	return lvalues

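#typical use of the hook machinery (illustrative sketch, not from the source): after a model
#is built, FeedForwardNetwork.addForwardHook(model, l) registers hookFn on the layer with
#index l, and getLatValues() then returns the latent values collected during forward passes
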
class FeedForwardNetwork(torch.nn.Module):
	def __init__(self, configFile, addDefValues=None):
		"""
		In the constructor we load the configuration, supplying defaults for any
		missing parameters.

		Parameters
			configFile : config file path
			addDefValues : dictionary of additional default values
		"""
		defValues = dict() if addDefValues is None else addDefValues.copy()
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.scaling.minrows"] = (50, None)
		defValues["common.scaling.param.file"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.device"] = ("cpu", None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.out.fields"] = (None, "missing training data output field ordinals")
		defValues["train.layer.data"] = (None, "missing layer data")
		defValues["train.input.size"] = (None, None)
		defValues["train.output.size"] = (None, "missing output size")
		defValues["train.batch.size"] = (10, None)
		defValues["train.loss.reduction"] = ("mean", None)
		defValues["train.num.iterations"] = (500, None)
		defValues["train.lossFn"] = ("mse", None)
		defValues["train.optimizer"] = ("sgd", None)
		defValues["train.opt.learning.rate"] = (.0001, None)
		defValues["train.opt.weight.decay"] = (0, None)
		defValues["train.opt.momentum"] = (0, None)
		defValues["train.opt.eps"] = (1e-08, None)
		defValues["train.opt.dampening"] = (0, None)
		defValues["train.opt.momentum.nesterov"] = (False, None)
		defValues["train.opt.betas"] = ([0.9, 0.999], None)
		defValues["train.opt.alpha"] = (0.99, None)
		defValues["train.save.model"] = (False, None)
		defValues["train.track.error"] = (False, None)
		defValues["train.epoch.intv"] = (5, None)
		defValues["train.batch.intv"] = (5, None)
		defValues["train.print.weights"] = (False, None)
		defValues["valid.data.file"] = (None, None)
		defValues["valid.accuracy.metric"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.use.saved.model"] = (True, None)
		defValues["predict.output"] = ("binary", None)
		defValues["predict.feat.pad.size"] = (60, None)
		defValues["predict.print.output"] = (True, None)
		defValues["calibrate.num.bins"] = (10, None)
		defValues["calibrate.pred.prob.thresh"] = (0.5, None)
		defValues["calibrate.num.nearest.neighbors"] = (10, None)
		self.config = Configuration(configFile, defValues)

		super(FeedForwardNetwork, self).__init__()

	def setConfigParam(self, name, value):
		"""
		set config param

		Parameters
			name : config name
			value : config value
		"""
		self.config.setParam(name, value)

	def getConfig(self):
		"""
		get config object
		"""
		return self.config

	def setVerbose(self, verbose):
		"""
		sets verbose flag
		"""
		self.verbose = verbose

	def buildModel(self):
		"""
		Loads configuration and builds the various pieces necessary for the model
		"""
		torch.manual_seed(9999)

		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		numinp = self.config.getIntConfig("train.input.size")[0]
		if numinp is None:
			numinp = len(self.config.getIntListConfig("train.data.feature.fields")[0])
		#numOut = len(self.config.getStringConfig("train.data.out.fields")[0].split(","))
		self.outputSize = self.config.getIntConfig("train.output.size")[0]
		self.batchSize = self.config.getIntConfig("train.batch.size")[0]
		#lossRed = self.config.getStringConfig("train.loss.reduction")[0]
		#learnRate = self.config.getFloatConfig("train.opt.learning.rate")[0]
		self.numIter = self.config.getIntConfig("train.num.iterations")[0]
		optimizer = self.config.getStringConfig("train.optimizer")[0]
		self.lossFnStr = self.config.getStringConfig("train.lossFn")[0]
		self.accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]
		self.trackErr = self.config.getBooleanConfig("train.track.error")[0]
		self.batchIntv = self.config.getIntConfig("train.batch.intv")[0]
		self.restored = False
		self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None

		#build network
		layers = list()
		ninp = numinp
		trData = self.config.getStringConfig("train.layer.data")[0].split(",")
		for ld in trData:
			lde = ld.split(":")
			assert len(lde) == 5, "expecting 5 items for layer data"

			#num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction
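			#e.g. train.layer.data=32:relu:true:false:0.2,16:relu:true:false:0.2,2:softmax:false:false:0 (illustrative values)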
			nunit = int(lde[0])
			actStr = lde[1]
			act = FeedForwardNetwork.createActivation(actStr) if actStr != "none" else None
			bnorm = lde[2] == "true"
			afterAct = lde[3] == "true"
			dpr = float(lde[4])

			layers.append(torch.nn.Linear(ninp, nunit))
			if bnorm:
				#with batch norm
				if afterAct:
					safeAppend(layers, act)
					layers.append(torch.nn.BatchNorm1d(nunit))
				else:
					layers.append(torch.nn.BatchNorm1d(nunit))
					safeAppend(layers, act)
			else:
				#without batch norm
				safeAppend(layers, act)

			if dpr > 0:
				layers.append(torch.nn.Dropout(dpr))
			ninp = nunit

		self.layers = torch.nn.Sequential(*layers)

		self.device = FeedForwardNetwork.getDevice(self)

		#training data
		dataFile = self.config.getStringConfig("train.data.file")[0]
		(featData, outData) = FeedForwardNetwork.prepData(self, dataFile)
		self.featData = torch.from_numpy(featData)
		self.outData = torch.from_numpy(outData)

		#validation data
		dataFile = self.config.getStringConfig("valid.data.file")[0]
		(featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)
		self.validFeatData = torch.from_numpy(featDataV)
		self.validOutData = torch.from_numpy(outDataV)

		# loss function and optimizer
		self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)
		self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)

		self.yPred = None
		self.restored = False

		#move data and model to device
		self.featData = self.featData.to(self.device)
		self.outData = self.outData.to(self.device)
		self.validFeatData = self.validFeatData.to(self.device)
		self.to(self.device)

	@staticmethod
	def getDevice(model):
		"""
		gets device

		Parameters
			model : torch model
		"""
		devType = model.config.getStringConfig("common.device")[0]
		if devType == "cuda":
			if torch.cuda.is_available():
				device = torch.device("cuda")
			else:
				exitWithMsg("cuda not available")
		else:
			device = torch.device("cpu")
		return device

	def setValidationData(self, dataSource, prep=True):
		"""
		sets validation data

		Parameters
			dataSource : data source str if file path or 2D array
			prep : if True load and prepare
		"""
		if prep:
			(featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)
			self.validFeatData = torch.from_numpy(featDataV)
			self.validOutData = outDataV
		else:
			self.validFeatData = torch.from_numpy(dataSource[0])
			self.validOutData = dataSource[1]

		self.validFeatData = self.validFeatData.to(self.device)

	@staticmethod
	def createActivation(actName):
		"""
		create activation

		Parameters
			actName : activation name
		"""
		if actName is None:
			activation = None
		elif actName == "relu":
			activation = torch.nn.ReLU()
		elif actName == "tanh":
			activation = torch.nn.Tanh()
		elif actName == "sigmoid":
			activation = torch.nn.Sigmoid()
		elif actName == "softmax":
			activation = torch.nn.Softmax(dim=1)
		else:
			exitWithMsg("invalid activation function name " + actName)
		return activation

	@staticmethod
	def createLossFunction(model, lossFnName):
		"""
		create loss function

		Parameters
			lossFnName : loss function name
		"""
		config = model.config
		lossRed = config.getStringConfig("train.loss.reduction")[0]
		if lossFnName == "ltwo" or lossFnName == "mse":
			lossFunc = torch.nn.MSELoss(reduction=lossRed)
		elif lossFnName == "ce":
			lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)
		elif lossFnName == "lone" or lossFnName == "mae":
			lossFunc = torch.nn.L1Loss(reduction=lossRed)
		elif lossFnName == "bce":
			lossFunc = torch.nn.BCELoss(reduction=lossRed)
		elif lossFnName == "bcel":
			lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)
		elif lossFnName == "sm":
			lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)
		elif lossFnName == "mlsm":
			lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)
		else:
			exitWithMsg("invalid loss function name " + lossFnName)
		return lossFunc

	@staticmethod
	def createOptimizer(model, optName):
		"""
		create optimizer

		Parameters
			optName : optimizer name
		"""
		config = model.config
		learnRate = config.getFloatConfig("train.opt.learning.rate")[0]
		weightDecay = config.getFloatConfig("train.opt.weight.decay")[0]
		momentum = config.getFloatConfig("train.opt.momentum")[0]
		eps = config.getFloatConfig("train.opt.eps")[0]
		if optName == "sgd":
			dampening = config.getFloatConfig("train.opt.dampening")[0]
			momentumNesterov = config.getBooleanConfig("train.opt.momentum.nesterov")[0]
			optimizer = torch.optim.SGD(model.parameters(), lr=learnRate, momentum=momentum,
				dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)
		elif optName == "adam":
			betas = config.getFloatListConfig("train.opt.betas")[0]
			betas = (betas[0], betas[1])
			optimizer = torch.optim.Adam(model.parameters(), lr=learnRate, betas=betas, eps=eps,
				weight_decay=weightDecay)
		elif optName == "rmsprop":
			alpha = config.getFloatConfig("train.opt.alpha")[0]
			optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,
				eps=eps, weight_decay=weightDecay, momentum=momentum)
		else:
			exitWithMsg("invalid optimizer name " + optName)
		return optimizer


	def forward(self, x):
		"""
		In the forward function we accept a Tensor of input data and we must return
		a Tensor of output data. We can use Modules defined in the constructor as
		well as arbitrary (differentiable) operations on Tensors.

		Parameters
			x : data batch
		"""
		y = self.layers(x)
		return y

	@staticmethod
	def addForwardHook(model, l, cl=0):
		"""
		register forward hooks

		Parameters
			l : layer index at which the hook is to be registered
			cl : current layer index, used for recursion
		"""
		for name, layer in model._modules.items():
			#If it is a sequential, don't register a hook on it
			# but recursively register hook on all its module children
			print(str(cl) + " : " + name)
			if isinstance(layer, torch.nn.Sequential):
				FeedForwardNetwork.addForwardHook(layer, l, cl)
			else:
				# it is not a sequential, register a hook
				if cl == l:
					print("setting hook at layer " + str(l))
					layer.register_forward_hook(hookFn)
				cl += 1

	@staticmethod
	def prepData(model, dataSource, includeOutFld=True):
		"""
		loads and prepares data

		Parameters
			dataSource : data source str if file path or 2D array
			includeOutFld : True if target field to be included
		"""
		# parameters
		fieldIndices = model.config.getIntListConfig("train.data.fields")[0]
		featFieldIndices = model.config.getIntListConfig("train.data.feature.fields")[0]

		#all data and feature data
		isDataFile = isinstance(dataSource, str)
		selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]
		if isDataFile:
			#source file path
			(data, featData) = loadDataFile(dataSource, ",", selFieldIndices, featFieldIndices)
		else:
			# tabular data
			data = tableSelFieldsFilter(dataSource, selFieldIndices)
			featData = tableSelFieldsFilter(data, featFieldIndices)
			#print(featData)
			featData = np.array(featData)

		if model.config.getStringConfig("common.preprocessing")[0] == "scale":
			scalingMethod = model.config.getStringConfig("common.scaling.method")[0]

			#scale only if there are enough rows
			nrow = featData.shape[0]
			minrows = model.config.getIntConfig("common.scaling.minrows")[0]
			if nrow > minrows:
				#in place scaling
				featData = scaleData(featData, scalingMethod)
			else:
				#use pre computed scaling parameters
				spFile = model.config.getStringConfig("common.scaling.param.file")[0]
				if spFile is None:
					exitWithMsg("for small data sets pre computed scaling parameters need to be provided")
				scParams = restoreObject(spFile)
				featData = scaleDataWithParams(featData, scalingMethod, scParams)
				featData = np.array(featData)

		# target data
		if includeOutFld:
			outFieldIndices = model.config.getStringConfig("train.data.out.fields")[0]
			outFieldIndices = strToIntArray(outFieldIndices, ",")
			if isDataFile:
				outData = data[:,outFieldIndices]
			else:
				outData = tableSelFieldsFilter(data, outFieldIndices)
				outData = np.array(outData)
			foData = (featData.astype(np.float32), outData.astype(np.float32))
		else:
			foData = featData.astype(np.float32)
		return foData

	@staticmethod
	def saveCheckpt(model):
		"""
		checkpoints model

		Parameters
			model : torch model
		"""
		print("..saving model checkpoint")
		modelDirectory = model.config.getStringConfig("common.model.directory")[0]
		assert os.path.exists(modelDirectory), "model save directory does not exist"
		modelFile = model.config.getStringConfig("common.model.file")[0]
		filepath = os.path.join(modelDirectory, modelFile)
		state = {"state_dict": model.state_dict(), "optim_dict": model.optimizer.state_dict()}
		torch.save(state, filepath)
		if model.verbose:
			print("model saved")

	@staticmethod
	def restoreCheckpt(model, loadOpt=False):
		"""
		restores checkpointed model

		Parameters
			model : torch model
			loadOpt : True if optimizer to be loaded
		"""
		if not model.restored:
			print("..restoring model checkpoint")
			modelDirectory = model.config.getStringConfig("common.model.directory")[0]
			modelFile = model.config.getStringConfig("common.model.file")[0]
			filepath = os.path.join(modelDirectory, modelFile)
			assert os.path.exists(filepath), "model save file does not exist"
			checkpoint = torch.load(filepath)
			model.load_state_dict(checkpoint["state_dict"])
			model.to(model.device)
			if loadOpt:
				model.optimizer.load_state_dict(checkpoint["optim_dict"])
			model.restored = True

	@staticmethod
	def processClassifOutput(yPred, config):
		"""
		extracts probability of label 1 or the label with highest probability

		Parameters
			yPred : predicted output
			config : config object
		"""
		outType = config.getStringConfig("predict.output")[0]
		if outType == "prob":
			outputSize = config.getIntConfig("train.output.size")[0]
			if outputSize == 2:
				#return prob of pos class for binary classifier
				yPred = yPred[:, 1]
			else:
				#return class value and probability for multi classifier
				yCl = np.argmax(yPred, axis=1)
				yPred = list(map(lambda y: y[0][y[1]], zip(yPred, yCl)))
				yPred = zip(yCl, yPred)
		else:
			yPred = np.argmax(yPred, axis=1)
		return yPred

	@staticmethod
	def printPrediction(yPred, config, dataSource):
		"""
		prints input feature data and prediction

		Parameters
			yPred : predicted output
			config : config object
			dataSource : data source str if file path or 2D array
		"""
		#prDataFilePath = config.getStringConfig("predict.data.file")[0]
		padWidth = config.getIntConfig("predict.feat.pad.size")[0]
		i = 0
		if type(dataSource) == str:
			for rec in fileRecGen(dataSource, ","):
				feat = (",".join(rec)).ljust(padWidth, " ")
				rec = feat + "\t" + str(yPred[i])
				print(rec)
				i += 1
		else:
			for rec in dataSource:
				srec = toStrList(rec, 6)
				feat = (",".join(srec)).ljust(padWidth, " ")
				srec = feat + "\t" + str(yPred[i])
				print(srec)
				i += 1


	@staticmethod
	def allTrain(model):
		"""
		train with all data

		Parameters
			model : torch model
		"""
		# train mode
		model.train()
		for t in range(model.numIter):

			# Forward pass: Compute predicted y by passing x to the model
			yPred = model(model.featData)

			# Compute and print loss
			loss = model.lossFn(yPred, model.outData)
			if model.verbose and t % 50 == 0:
				print("epoch {} loss {:.6f}".format(t, loss.item()))

			# Zero gradients, perform a backward pass, and update the weights.
			model.optimizer.zero_grad()
			loss.backward()
			model.optimizer.step()

		#validate
		model.eval()
		yPred = model(model.validFeatData)
		yPred = yPred.data.cpu().numpy()
		yActual = model.validOutData
		if model.verbose:
			result = np.concatenate((yPred, yActual), axis=1)
			print("predicted actual")
			print(result)

		score = perfMetric(model.accMetric, yActual, yPred)
		print(formatFloat(3, score, "perf score"))
		return score

	@staticmethod
	def batchTrain(model):
		"""
		train with batch data

		Parameters
			model : torch model
		"""
		model.restored = False
		trainData = TensorDataset(model.featData, model.outData)
		trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)
		epochIntv = model.config.getIntConfig("train.epoch.intv")[0]

		# train mode
		model.train()

		if model.trackErr:
			trErr = list()
			vaErr = list()
		#epoch
		for t in range(model.numIter):
			#batch
			b = 0
			epochLoss = 0.0
			for xBatch, yBatch in trainDataLoader:

				# Forward pass: Compute predicted y by passing x to the model
				xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)
				yPred = model(xBatch)

				# Compute and print loss
				loss = model.lossFn(yPred, yBatch)
				if model.verbose and t % epochIntv == 0 and b % model.batchIntv == 0:
					print("epoch {} batch {} loss {:.6f}".format(t, b, loss.item()))

				if model.trackErr and model.batchIntv == 0:
					epochLoss += loss.item()

				#error tracking at batch level
				if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:
					trErr.append(loss.item())
					vloss = FeedForwardNetwork.evaluateModel(model)
					vaErr.append(vloss)

				# Zero gradients, perform a backward pass, and update the weights.
				model.optimizer.zero_grad()
				loss.backward()
				model.optimizer.step()
				b += 1

			#error tracking at epoch level
			if model.trackErr and model.batchIntv == 0:
				epochLoss /= len(trainDataLoader)
				trErr.append(epochLoss)
				vloss = FeedForwardNetwork.evaluateModel(model)
				vaErr.append(vloss)

		#validate
		model.eval()
		yPred = model(model.validFeatData)
		yPred = yPred.data.cpu().numpy()
		yActual = model.validOutData
		if model.verbose:
			vsize = yPred.shape[0]
			print("\npredicted \t\t actual")
			for i in range(vsize):
				print(str(yPred[i]) + "\t" + str(yActual[i]))

		score = perfMetric(model.accMetric, yActual, yPred)
		print(yActual)
		print(yPred)
		print(formatFloat(3, score, "perf score"))

		#save
		modelSave = model.config.getBooleanConfig("train.save.model")[0]
		if modelSave:
			FeedForwardNetwork.saveCheckpt(model)

		if model.trackErr:
			FeedForwardNetwork.errorPlot(model, trErr, vaErr)

		if model.config.getBooleanConfig("train.print.weights")[0]:
			print("model weights")
			for param in model.parameters():
				print(param.data)
		return score

	@staticmethod
	def errorPlot(model, trErr, vaErr):
		"""
		plot errors

		Parameters
			trErr : training error list
			vaErr : validation error list
		"""
		x = np.arange(len(trErr))
		plt.plot(x, trErr, label="training error")
		plt.plot(x, vaErr, label="validation error")
		plt.xlabel("iteration")
		plt.ylabel("error")
		plt.legend(["training error", "validation error"], loc='upper left')
		plt.show()

	@staticmethod
	def modelPredict(model, dataSource=None):
		"""
		predict

		Parameters
			model : torch model
			dataSource : data source
		"""
		#train or restore model
		useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			FeedForwardNetwork.restoreCheckpt(model)
		else:
			FeedForwardNetwork.batchTrain(model)

		#predict
		if dataSource is None:
			dataSource = model.config.getStringConfig("predict.data.file")[0]
		featData = FeedForwardNetwork.prepData(model, dataSource, False)
		#print(featData)
		featData = torch.from_numpy(featData)
		featData = featData.to(model.device)

		model.eval()
		yPred = model(featData)
		yPred = yPred.data.cpu().numpy()
		#print(yPred)

		if model.outputSize >= 2:
			#classification
			yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)

		# print prediction
		if model.config.getBooleanConfig("predict.print.output")[0]:
			FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)

		return yPred

	def predict(self, dataSource=None):
		"""
		predict

		Parameters
			dataSource : data source
		"""
		return FeedForwardNetwork.modelPredict(self, dataSource)

	@staticmethod
	def evaluateModel(model):
		"""
		evaluate model

		Parameters
			model : torch model
		"""
		model.eval()
		with torch.no_grad():
			yPred = model(model.validFeatData)
			#yPred = yPred.data.cpu().numpy()
			yActual = model.validOutData
			score = model.lossFn(yPred, yActual).item()
		model.train()
		return score

	@staticmethod
	def prepValidate(model, dataSource=None):
		"""
		prepare for validation

		Parameters
			model : torch model
			dataSource : data source
		"""
		#train or restore model
		if not model.restored:
			useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
			if useSavedModel:
				FeedForwardNetwork.restoreCheckpt(model)
			else:
				FeedForwardNetwork.batchTrain(model)
			model.restored = True

		if dataSource is not None:
			model.setValidationData(dataSource)

	@staticmethod
	def validateModel(model, retPred=False):
		"""
		model validation

		Parameters
			model : torch model
			retPred : if True return prediction
		"""
		model.eval()
		yPred = model(model.validFeatData)
		yPred = yPred.data.cpu().numpy()
		model.yPred = yPred
		yActual = model.validOutData
		vsize = yPred.shape[0]
		if model.verbose:
			print("\npredicted \t actual")
			for i in range(vsize):
				print("{:.3f}\t\t{:.3f}".format(yPred[i][0], yActual[i][0]))

		score = perfMetric(model.accMetric, yActual, yPred)
		print(formatFloat(3, score, "perf score"))

		if retPred:
			y = list(map(lambda i: (yPred[i][0], yActual[i][0]), range(vsize)))
			res = (y, score)
			return res
		else:
			return score
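
A minimal usage sketch for FeedForwardNetwork (the properties file name, data file name and flow are illustrative, assuming a config with the train.*, valid.* and predict.* keys listed above):

model = FeedForwardNetwork("tnn.properties")
model.buildModel()
score = FeedForwardNetwork.batchTrain(model)	#train with mini batches, validate and optionally checkpoint
yPred = model.predict("pred.csv")	#restore or train, then predict per the predict.* configuration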