#!/usr/local/bin/python3 | |
# avenir-python: Machine Learning | |
# Author: Pranab Ghosh | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); you | |
# may not use this file except in compliance with the License. You may | |
# obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |
# implied. See the License for the specific language governing | |
# permissions and limitations under the License. | |
# Package imports | |
import os | |
import sys | |
import numpy as np | |
from sklearn import preprocessing | |
from sklearn import metrics | |
from sklearn.datasets import make_blobs | |
from sklearn.datasets import make_classification | |
import random | |
from math import * | |
from decimal import Decimal | |
import statistics | |
import jprops | |
from Levenshtein import distance as ld | |
from .util import * | |
from .sampler import * | |
class Configuration: | |
""" | |
Configuration management. Supports default value, mandatory value and typed value. | |
""" | |
def __init__(self, configFile, defValues, verbose=False): | |
""" | |
initializer | |
Parameters | |
configFile : config file path | |
defValues : dictionary of default values | |
verbose : verbosity flag | |
""" | |
configs = {} | |
with open(configFile) as fp: | |
for key, value in jprops.iter_properties(fp): | |
configs[key] = value | |
self.configs = configs | |
self.defValues = defValues | |
self.verbose = verbose | |
def override(self, configFile): | |
""" | |
		override configuration from file
Parameters | |
configFile : override config file path | |
""" | |
with open(configFile) as fp: | |
for key, value in jprops.iter_properties(fp): | |
self.configs[key] = value | |
def setParam(self, name, value): | |
""" | |
override individual configuration | |
Parameters | |
name : config param name | |
value : config param value | |
""" | |
self.configs[name] = value | |
def getStringConfig(self, name): | |
""" | |
get string param | |
Parameters | |
name : config param name | |
""" | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
val = (self.configs[name], False) | |
if self.verbose: | |
print( "{} {} {}".format(name, self.configs[name], val[0])) | |
return val | |
def getIntConfig(self, name): | |
""" | |
get int param | |
Parameters | |
name : config param name | |
""" | |
#print "%s %s" %(name,self.configs[name]) | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
val = (int(self.configs[name]), False) | |
if self.verbose: | |
print( "{} {} {}".format(name, self.configs[name], val[0])) | |
return val | |
def getFloatConfig(self, name): | |
""" | |
get float param | |
Parameters | |
name : config param name | |
""" | |
#print "%s %s" %(name,self.configs[name]) | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
val = (float(self.configs[name]), False) | |
if self.verbose: | |
print( "{} {} {:06.3f}".format(name, self.configs[name], val[0])) | |
return val | |
def getBooleanConfig(self, name): | |
""" | |
		get boolean param
Parameters | |
name : config param name | |
""" | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
bVal = self.configs[name].lower() == "true" | |
val = (bVal, False) | |
if self.verbose: | |
print( "{} {} {}".format(name, self.configs[name], val[0])) | |
return val | |
def getIntListConfig(self, name, delim=","): | |
""" | |
get int list param | |
Parameters | |
name : config param name | |
			delim : delimiter
""" | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
delSepStr = self.getStringConfig(name) | |
#specified as range | |
intList = strListOrRangeToIntArray(delSepStr[0]) | |
val =(intList, delSepStr[1]) | |
return val | |
def getFloatListConfig(self, name, delim=","): | |
""" | |
get float list param | |
Parameters | |
name : config param name | |
			delim : delimiter
""" | |
delSepStr = self.getStringConfig(name) | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
flList = strToFloatArray(delSepStr[0], delim) | |
val =(flList, delSepStr[1]) | |
return val | |
def getStringListConfig(self, name, delim=","): | |
""" | |
get string list param | |
Parameters | |
name : config param name | |
			delim : delimiter
""" | |
delSepStr = self.getStringConfig(name) | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
strList = delSepStr[0].split(delim) | |
val = (strList, delSepStr[1]) | |
return val | |
def handleDefault(self, name): | |
""" | |
handles default | |
Parameters | |
name : config param name | |
""" | |
dVal = self.defValues[name] | |
if (dVal[1] is None): | |
val = dVal[0] | |
else: | |
raise ValueError(dVal[1]) | |
return val | |
def isNone(self, name): | |
""" | |
		true if the value is None
Parameters | |
name : config param name | |
""" | |
return self.configs[name].lower() == "none" | |
def isDefault(self, name): | |
""" | |
true if the value is default | |
Parameters | |
name : config param name | |
""" | |
de = self.configs[name] == "_" | |
#print de | |
return de | |
def eitherOrStringConfig(self, firstName, secondName): | |
""" | |
returns one of two string parameters | |
Parameters | |
firstName : first parameter name | |
secondName : second parameter name | |
""" | |
if not self.isNone(firstName): | |
first = self.getStringConfig(firstName)[0] | |
second = None | |
if not self.isNone(secondName): | |
raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName) | |
else: | |
if not self.isNone(secondName): | |
				second = self.getStringConfig(secondName)[0]
first = None | |
else: | |
raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName) | |
return (first, second) | |
def eitherOrIntConfig(self, firstName, secondName): | |
""" | |
returns one of two int parameters | |
Parameters | |
firstName : first parameter name | |
secondName : second parameter name | |
""" | |
if not self.isNone(firstName): | |
first = self.getIntConfig(firstName)[0] | |
second = None | |
if not self.isNone(secondName): | |
raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName) | |
else: | |
if not self.isNone(secondName): | |
				second = self.getIntConfig(secondName)[0]
first = None | |
else: | |
raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName) | |
return (first, second) | |
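# Illustrative usage sketch for Configuration: the properties file name and the property keys
# below are hypothetical; the file is expected to define each key, with the value "_" used to
# fall back to the default and "none" for a missing value.
def exampleConfigurationUsage():
	"""
	sketch of typical Configuration usage with default and mandatory values
	"""
	defValues = dict()
	defValues["train.data.file"] = (None, "missing training data file path")
	defValues["train.num.iter"] = (100, None)
	defValues["train.learning.rate"] = (0.01, None)
	config = Configuration("app.properties", defValues, verbose=True)
	dataFile = config.getStringConfig("train.data.file")[0]
	numIter = config.getIntConfig("train.num.iter")[0]
	lrate = config.getFloatConfig("train.learning.rate")[0]
	return (dataFile, numIter, lrate)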
class CatLabelGenerator: | |
""" | |
label generator for categorical variables | |
""" | |
def __init__(self, catValues, delim): | |
""" | |
		initializer
Parameters | |
catValues : dictionary of categorical values | |
			delim : delimiter
""" | |
self.encoders = {} | |
self.catValues = catValues | |
self.delim = delim | |
for k in self.catValues.keys(): | |
le = preprocessing.LabelEncoder() | |
le.fit(self.catValues[k]) | |
self.encoders[k] = le | |
def processRow(self, row): | |
""" | |
encode row categorical values | |
Parameters: | |
row : data row | |
""" | |
#print row | |
rowArr = row.split(self.delim) | |
for i in range(len(rowArr)): | |
if (i in self.catValues): | |
curVal = rowArr[i] | |
				assert curVal in self.catValues[i], "categorical value invalid"
encVal = self.encoders[i].transform([curVal]) | |
rowArr[i] = str(encVal[0]) | |
return self.delim.join(rowArr) | |
def getOrigLabels(self, indx): | |
""" | |
get original labels | |
Parameters: | |
indx : column index | |
""" | |
return self.encoders[indx].classes_ | |
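# Illustrative usage sketch for CatLabelGenerator: column 1 is declared categorical and gets
# label encoded, the remaining columns pass through unchanged; the values are hand made.
def exampleCatLabelGenerator():
	"""
	sketch of categorical label encoding for one column of a delimited row
	"""
	catValues = {1 : ["red", "green", "blue"]}
	gen = CatLabelGenerator(catValues, ",")
	encRow = gen.processRow("3.5,green,7")
	origLabels = gen.getOrigLabels(1)
	return (encRow, origLabels)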
class SupvLearningDataGenerator: | |
""" | |
data generator for supervised learning | |
""" | |
def __init__(self, configFile): | |
""" | |
		initializer
Parameters | |
configFile : config file path | |
""" | |
defValues = dict() | |
defValues["common.num.samp"] = (100, None) | |
defValues["common.num.feat"] = (5, None) | |
defValues["common.feat.trans"] = (None, None) | |
defValues["common.feat.types"] = (None, "missing feature types") | |
defValues["common.cat.feat.distr"] = (None, None) | |
defValues["common.output.precision"] = (3, None) | |
defValues["common.error"] = (0.01, None) | |
defValues["class.gen.technique"] = ("blob", None) | |
defValues["class.num.feat.informative"] = (2, None) | |
defValues["class.num.feat.redundant"] = (2, None) | |
defValues["class.num.feat.repeated"] = (0, None) | |
defValues["class.num.feat.cat"] = (0, None) | |
defValues["class.num.class"] = (2, None) | |
self.config = Configuration(configFile, defValues) | |
def genClassifierData(self): | |
""" | |
generates classifier data | |
""" | |
nsamp = self.config.getIntConfig("common.num.samp")[0] | |
nfeat = self.config.getIntConfig("common.num.feat")[0] | |
nclass = self.config.getIntConfig("class.num.class")[0] | |
#transform with shift and scale | |
ftrans = self.config.getFloatListConfig("common.feat.trans")[0] | |
feTrans = dict() | |
for i in range(0, len(ftrans), 2): | |
tr = (ftrans[i], ftrans[i+1]) | |
indx = int(i/2) | |
feTrans[indx] = tr | |
ftypes = self.config.getStringListConfig("common.feat.types")[0] | |
# categorical feature distribution | |
feCatDist = dict() | |
fcatdl = self.config.getStringListConfig("common.cat.feat.distr")[0] | |
for fcatds in fcatdl: | |
fcatd = fcatds.split(":") | |
feInd = int(fcatd[0]) | |
clVal = int(fcatd[1]) | |
key = (feInd, clVal) #feature index and class value | |
dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2))) | |
feCatDist[key] = CategoricalRejectSampler(*dist) | |
#shift and scale | |
genTechnique = self.config.getStringConfig("class.gen.technique")[0] | |
error = self.config.getFloatConfig("common.error")[0] | |
if genTechnique == "blob": | |
features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat) | |
for i in range(nsamp): #shift and scale | |
for j in range(nfeat): | |
tr = feTrans[j] | |
features[i,j] = (features[i,j] + tr[0]) * tr[1] | |
claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz))) | |
elif genTechnique == "classify": | |
nfeatInfo = self.config.getIntConfig("class.num.feat.informative")[0] | |
nfeatRed = self.config.getIntConfig("class.num.feat.redundant")[0] | |
nfeatRep = self.config.getIntConfig("class.num.feat.repeated")[0] | |
shifts = list(map(lambda i : feTrans[i][0], range(nfeat))) | |
scales = list(map(lambda i : feTrans[i][1], range(nfeat))) | |
features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed, | |
n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales) | |
else: | |
			raise ValueError("invalid generation technique")
# add categorical features and format | |
nCatFeat = self.config.getIntConfig("class.num.feat.cat")[0] | |
prec = self.config.getIntConfig("common.output.precision")[0] | |
for f , c in zip(features, claz): | |
			nfs = list(map(lambda i : self.numFeToStr(f[i], ftypes[i], prec), range(nfeat)))
if nCatFeat > 0: | |
cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1))) | |
rec = ",".join(nfs) + "," + ",".join(cfs) + "," + str(c) | |
else: | |
rec = ",".join(nfs) + "," + str(c) | |
yield rec | |
def numFeToStr(self, fv, ft, prec): | |
""" | |
		numeric feature value to string
Parameters | |
fv : field value | |
ft : field data type | |
prec : precision | |
""" | |
if ft == "float": | |
s = formatFloat(prec, fv) | |
elif ft =="int": | |
s = str(int(fv)) | |
else: | |
			raise ValueError("invalid type expecting float or int")
return s | |
def catFe(self, i, cv, ft, feCatDist): | |
""" | |
generate categorical feature | |
Parameters | |
i : col index | |
cv : class value | |
ft : field data type | |
feCatDist : cat value distribution | |
""" | |
if ft == "cat": | |
key = (i, cv) | |
s = feCatDist[key].sample() | |
else: | |
			raise ValueError("invalid type expecting categorical")
return s | |
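# Illustrative usage sketch for SupvLearningDataGenerator: generation is driven entirely by a
# properties file; the file name here is hypothetical and it must set the keys listed in
# __init__ (common.num.samp, common.feat.types, class.gen.technique and so on).
def exampleClassifierDataGen():
	"""
	sketch of synthetic classification data generation
	"""
	gen = SupvLearningDataGenerator("class_gen.properties")
	recs = list(gen.genClassifierData())
	return recs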
class RegressionDataGenerator: | |
""" | |
data generator for regression, including square terms, cross terms, bias, noise, correlated variables | |
and user defined function | |
""" | |
def __init__(self, configFile, callback=None): | |
""" | |
		initializer
Parameters | |
configFile : config file path | |
callback : user defined function | |
""" | |
defValues = dict() | |
defValues["common.pvar.samplers"] = (None, None) | |
defValues["common.pvar.ranges"] = (None, None) | |
defValues["common.linear.weights"] = (None, None) | |
defValues["common.square.weights"] = (None, None) | |
defValues["common.crterm.weights"] = (None, None) | |
defValues["common.corr.params"] = (None, None) | |
defValues["common.bias"] = (0, None) | |
defValues["common.noise"] = (None, None) | |
defValues["common.tvar.range"] = (None, None) | |
defValues["common.weight.niter"] = (20, None) | |
self.config = Configuration(configFile, defValues) | |
self.callback = callback | |
#samplers for predictor variables | |
items = self.config.getStringListConfig("common.pvar.samplers")[0] | |
self.samplers = list(map(lambda s : createSampler(s), items)) | |
self.npvar = len(self.samplers) | |
#values range for predictor variables | |
items = self.config.getStringListConfig("common.pvar.ranges")[0] | |
self.pvranges = list() | |
for i in range(0, len(items), 2): | |
if items[i] =="none": | |
r = None | |
else: | |
vmin = float(items[i]) | |
vmax = float(items[i+1]) | |
r = (vmin, vmax, vmax-vmin) | |
self.pvranges.append(r) | |
		assertEqual(len(self.pvranges), self.npvar, "no of predictor var ranges provided is invalid")
#linear weights for predictor variables | |
self.lweights = self.config.getFloatListConfig("common.linear.weights")[0] | |
		assertEqual(len(self.lweights), self.npvar, "no of linear weights provided is invalid")
#square weights for predictor variables | |
items = self.config.getStringListConfig("common.square.weights")[0] | |
self.sqweight = dict() | |
for i in range(0, len(items), 2): | |
vi = int(items[i]) | |
assertLesser(vi, self.npvar, "invalid predictor var index") | |
wt = float(items[i+1]) | |
self.sqweight[vi] = wt | |
#crossterm weights for predictor variables | |
items = self.config.getStringListConfig("common.crterm.weights")[0] | |
self.crweight = dict() | |
for i in range(0, len(items), 3): | |
vi = int(items[i]) | |
assertLesser(vi, self.npvar, "invalid predictor var index") | |
vj = int(items[i+1]) | |
assertLesser(vj, self.npvar, "invalid predictor var index") | |
wt = float(items[i+2]) | |
vp = (vi, vj) | |
self.crweight[vp] = wt | |
#correlated variables | |
items = self.config.getStringListConfig("common.corr.params")[0] | |
self.corrparams = dict() | |
for co in items: | |
cparam = co.split(":") | |
vi = int(cparam[0]) | |
vj = int(cparam[1]) | |
k = (vi,vj) | |
bias = float(cparam[2]) | |
wt = float(cparam[3]) | |
noise = float(cparam[4]) | |
roundoff = cparam[5] == "true" | |
v = (bias, wt, noise, roundoff) | |
self.corrparams[k] = v | |
		#bias, noise and target range values
self.bias = self.config.getFloatConfig("common.bias")[0] | |
noise = self.config.getStringListConfig("common.noise")[0] | |
self.ndistr = noise[0] | |
self.noise = float(noise[1]) | |
self.tvarlim = self.config.getFloatListConfig("common.tvar.range")[0] | |
#sample | |
niter = self.config.getIntConfig("common.weight.niter")[0] | |
yvals = list() | |
for i in range(niter): | |
y = self.sample()[1] | |
yvals.append(y) | |
#scale weights by sampled mean and target mean | |
my = statistics.mean(yvals) | |
myt =(self.tvarlim[1] - self.tvarlim[0]) / 2 | |
sc = (myt - self.bias) / (my - self.bias) | |
#print("weight scale {:.3f}".format(sc)) | |
self.lweights = list(map(lambda w : w * sc, self.lweights)) | |
#print("weights {}".format(toStrFromList(self.lweights, 3))) | |
for k in self.sqweight.keys(): | |
self.sqweight[k] *= sc | |
for k in self.crweight.keys(): | |
self.crweight[k] *= sc | |
def sample(self): | |
""" | |
sample predictor variables and target variable | |
""" | |
pvd = list(map(lambda s : s.sample(), self.samplers)) | |
#correct for correlated variables | |
for k in self.corrparams.keys(): | |
vi = k[0] | |
vj = k[1] | |
v = self.corrparams[k] | |
bias = v[0] | |
wt = v[1] | |
noise = v[2] | |
roundoff = v[3] | |
nv = bias + wt * pvd[vi] | |
pvd[vj] = preturbScalar(nv, noise, "normal") | |
if roundoff: | |
pvd[vj] = round(pvd[vj]) | |
spvd = list() | |
lsum = self.bias | |
for i in range(self.npvar): | |
#range limit | |
if self.pvranges[i] is not None: | |
pvd[i] = rangeLimit(pvd[i], self.pvranges[i][0], self.pvranges[i][1]) | |
spvd.append(pvd[i]) | |
#scale | |
pvd[i] = scaleMinMaxScaData(pvd[i], self.pvranges[i]) | |
lsum += self.lweights[i] * pvd[i] | |
#square terms | |
ssum = 0 | |
for k in self.sqweight.keys(): | |
			ssum += self.sqweight[k] * pvd[k] * pvd[k]
#cross terms | |
crsum = 0 | |
for k in self.crweight.keys(): | |
vi = k[0] | |
vj = k[1] | |
crsum += self.crweight[k] * pvd[vi] * pvd[vj] | |
y = lsum + ssum + crsum | |
y = preturbScalar(y, self.noise, self.ndistr) | |
if self.callback is not None: | |
ufy = self.callback(spvd) | |
y += ufy | |
r = (spvd, y) | |
return r | |
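# Illustrative usage sketch for RegressionDataGenerator: each call to sample() returns a
# (predictor values, target value) pair; the config file name is hypothetical and must set the
# common.* keys listed in __init__.
def exampleRegressionDataGen():
	"""
	sketch of synthetic regression data generation
	"""
	gen = RegressionDataGenerator("regr_gen.properties")
	samples = list()
	for _ in range(5):
		pvd, y = gen.sample()
		samples.append((pvd, y))
	return samples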
def loadDataFile(file, delim, cols, colIndices): | |
""" | |
loads delim separated file and extracts columns | |
Parameters | |
file : file path | |
		delim : delimiter
		cols : columns to use from file
		colIndices : columns to extract
""" | |
data = np.loadtxt(file, delimiter=delim, usecols=cols) | |
extrData = data[:,colIndices] | |
return (data, extrData) | |
def loadFeatDataFile(file, delim, cols): | |
""" | |
loads delim separated file and extracts columns | |
Parameters | |
file : file path | |
		delim : delimiter
cols : columns to use from file | |
""" | |
data = np.loadtxt(file, delimiter=delim, usecols=cols) | |
return data | |
def extrColumns(arr, columns): | |
""" | |
extracts columns | |
Parameters | |
arr : 2D array | |
columns : columns | |
""" | |
return arr[:, columns] | |
def subSample(featData, clsData, subSampleRate, withReplacement): | |
""" | |
subsample feature and class label data | |
Parameters | |
featData : 2D array of feature data | |
clsData : arrray of class labels | |
subSampleRate : fraction to be sampled | |
withReplacement : true if sampling with replacement | |
""" | |
sampSize = int(featData.shape[0] * subSampleRate) | |
sampledIndx = np.random.choice(featData.shape[0],sampSize, replace=withReplacement) | |
sampFeat = featData[sampledIndx] | |
sampCls = clsData[sampledIndx] | |
return(sampFeat, sampCls) | |
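# Illustrative sketch: draws a 50 percent row sample without replacement while keeping feature
# rows and class labels aligned; the arrays are hand made.
def exampleSubSample():
	"""
	sketch of sub sampling feature and class label data
	"""
	featData = np.arange(20).reshape(10, 2)
	clsData = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
	sampFeat, sampCls = subSample(featData, clsData, 0.5, False)
	return (sampFeat, sampCls)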
def euclideanDistance(x,y): | |
""" | |
euclidean distance | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y))) | |
def squareRooted(x): | |
""" | |
square root of sum square | |
Parameters | |
x : data vector | |
""" | |
return round(sqrt(sum([a*a for a in x])),3) | |
def cosineSimilarity(x,y): | |
""" | |
cosine similarity | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
numerator = sum(a*b for a,b in zip(x,y)) | |
denominator = squareRooted(x) * squareRooted(y) | |
return round(numerator / float(denominator), 3) | |
def cosineDistance(x,y): | |
""" | |
cosine distance | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
return 1.0 - cosineSimilarity(x,y) | |
def manhattanDistance(x,y): | |
""" | |
manhattan distance | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
return sum(abs(a-b) for a,b in zip(x,y)) | |
def nthRoot(value, nRoot): | |
""" | |
nth root | |
Parameters | |
value : data value | |
nRoot : root | |
""" | |
rootValue = 1/float(nRoot) | |
return round (Decimal(value) ** Decimal(rootValue),3) | |
def minkowskiDistance(x,y,pValue): | |
""" | |
minkowski distance | |
Parameters | |
x : first vector | |
		y : second vector
pValue : power factor | |
""" | |
return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue) | |
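# Illustrative sketch: for pValue 1 the Minkowski distance reduces to the Manhattan distance and
# for pValue 2 to the Euclidean distance (up to the 3 place rounding used above).
def exampleDistanceMetrics():
	"""
	sketch relating the distance functions defined above
	"""
	x = [1.0, 2.0, 3.0]
	y = [4.0, 0.0, 3.0]
	dman = manhattanDistance(x, y)          # 5.0
	deuc = euclideanDistance(x, y)          # sqrt(13), about 3.606
	dmk1 = minkowskiDistance(x, y, 1)       # matches dman
	dmk2 = minkowskiDistance(x, y, 2)       # matches deuc after rounding
	return (dman, deuc, dmk1, dmk2)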
def jaccardSimilarityX(x,y): | |
""" | |
jaccard similarity | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
intersectionCardinality = len(set.intersection(*[set(x), set(y)])) | |
unionCardinality = len(set.union(*[set(x), set(y)])) | |
return intersectionCardinality/float(unionCardinality) | |
def jaccardSimilarity(x,y,wx=1.0,wy=1.0): | |
""" | |
jaccard similarity | |
Parameters | |
x : first vector | |
		y : second vector
wx : weight for x | |
wy : weight for y | |
""" | |
sx = set(x) | |
sy = set(y) | |
sxyInt = sx.intersection(sy) | |
intCardinality = len(sxyInt) | |
sxIntDiff = sx.difference(sxyInt) | |
syIntDiff = sy.difference(sxyInt) | |
unionCardinality = len(sx.union(sy)) | |
return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff)) | |
def levenshteinSimilarity(s1, s2): | |
""" | |
Levenshtein similarity for strings | |
Parameters | |
		s1 : first string
		s2 : second string
""" | |
assert type(s1) == str and type(s2) == str, "Levenshtein similarity is for string only" | |
d = ld(s1,s2) | |
#print(d) | |
l = max(len(s1),len(s2)) | |
d = 1.0 - min(d/l, 1.0) | |
return d | |
def norm(values, po=2): | |
""" | |
norm | |
Parameters | |
values : list of values | |
po : power | |
""" | |
no = sum(list(map(lambda v: pow(v,po), values))) | |
no = pow(no,1.0/po) | |
return list(map(lambda v: v/no, values)) | |
def createOneHotVec(size, indx = -1): | |
""" | |
random one hot vector | |
Parameters | |
size : vector size | |
indx : one hot position | |
""" | |
vec = [0] * size | |
s = random.randint(0, size - 1) if indx < 0 else indx | |
vec[s] = 1 | |
return vec | |
def createAllOneHotVec(size): | |
""" | |
create all one hot vectors | |
Parameters | |
size : vector size and no of vectors | |
""" | |
vecs = list() | |
for i in range(size): | |
vec = [0] * size | |
vec[i] = 1 | |
vecs.append(vec) | |
return vecs | |
def blockShuffle(data, blockSize): | |
""" | |
block shuffle | |
Parameters | |
data : list data | |
blockSize : block size | |
""" | |
numBlock = int(len(data) / blockSize) | |
remain = len(data) % blockSize | |
numBlock += (1 if remain > 0 else 0) | |
shuffled = list() | |
for i in range(numBlock): | |
b = random.randint(0, numBlock-1) | |
beg = b * blockSize | |
if (b < numBlock-1): | |
end = beg + blockSize | |
shuffled.extend(data[beg:end]) | |
else: | |
shuffled.extend(data[beg:]) | |
return shuffled | |
def shuffle(data, numShuffle): | |
""" | |
	shuffle data by random swapping
Parameters | |
data : list data | |
numShuffle : no of pairwise swaps | |
""" | |
sz = len(data) | |
if numShuffle is None: | |
numShuffle = int(sz / 2) | |
for i in range(numShuffle): | |
fi = random.randint(0, sz -1) | |
se = random.randint(0, sz -1) | |
tmp = data[fi] | |
data[fi] = data[se] | |
data[se] = tmp | |
def randomWalk(size, start, lowStep, highStep): | |
""" | |
random walk | |
Parameters | |
		size : no of steps
start : initial position | |
lowStep : step min | |
highStep : step max | |
""" | |
cur = start | |
for i in range(size): | |
yield cur | |
cur += randomFloat(lowStep, highStep) | |
def binaryEcodeCategorical(values, value): | |
""" | |
one hot binary encoding | |
Parameters | |
values : list of values | |
value : value to be replaced with 1 | |
""" | |
size = len(values) | |
vec = [0] * size | |
for i in range(size): | |
if (values[i] == value): | |
vec[i] = 1 | |
return vec | |
def createLabeledSeq(inputData, tw): | |
""" | |
Creates feature, label pair from sequence data, where we have tw number of features followed by output | |
Parameters | |
		inputData : list of sequence data values
tw : no of features | |
""" | |
features = list() | |
labels = list() | |
	l = len(inputData)
for i in range(l - tw): | |
trainSeq = inputData[i:i+tw] | |
trainLabel = inputData[i+tw] | |
features.append(trainSeq) | |
labels.append(trainLabel) | |
return (features, labels) | |
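# Illustrative sketch: a window size of 3 turns the sequence [1, 2, 3, 4, 5] into features
# [[1, 2, 3], [2, 3, 4]] with labels [4, 5].
def exampleCreateLabeledSeq():
	"""
	sketch of sliding window feature and label creation from sequence data
	"""
	seq = [1, 2, 3, 4, 5]
	features, labels = createLabeledSeq(seq, 3)
	return (features, labels)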
def createLabeledSeqFromFile(filePath, delim, index, tw):
	"""
	Creates feature, label pair from 1D sequence data in file
	Parameters
		filePath : file path
		delim : delimiter
		index : column index
		tw : no of features
	"""
	seqData = getFileColumnAsFloat(filePath, delim, index)
	return createLabeledSeq(seqData, tw)
def fromMultDimSeqToTabular(data, inpSize, seqLen): | |
""" | |
Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize) | |
Parameters | |
data : 2D array | |
inpSize : each input size in sequence | |
seqLen : sequence length | |
""" | |
nrow = data.shape[0] | |
assert data.shape[1] == inpSize * seqLen, "invalid input size or sequence length" | |
return data.reshape(nrow * seqLen, inpSize) | |
def fromTabularToMultDimSeq(data, inpSize, seqLen): | |
""" | |
Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen) | |
Parameters | |
data : 2D array | |
inpSize : each input size in sequence | |
seqLen : sequence length | |
""" | |
nrow = int(data.shape[0] / seqLen) | |
assert data.shape[1] == inpSize, "invalid input size" | |
return data.reshape(nrow, seqLen * inpSize) | |
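# Illustrative sketch: round trip between the flattened sequence layout (nrow, inpSize * seqLen)
# and the tabular layout (nrow * seqLen, inpSize).
def exampleSeqTabularReshape():
	"""
	sketch of reshaping between sequence and tabular layouts
	"""
	data = np.arange(24).reshape(4, 6)           # 4 rows, inpSize 2, seqLen 3
	tab = fromMultDimSeqToTabular(data, 2, 3)    # shape (12, 2)
	seq = fromTabularToMultDimSeq(tab, 2, 3)     # back to shape (4, 6)
	return np.array_equal(data, seq)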
def difference(data, interval=1): | |
""" | |
takes difference in time series data | |
Parameters | |
data :list data | |
interval : interval for difference | |
""" | |
diff = list() | |
for i in range(interval, len(data)): | |
value = data[i] - data[i - interval] | |
diff.append(value) | |
return diff | |
def normalizeMatrix(data, norm, axis=1): | |
""" | |
normalized each row of the matrix | |
Parameters | |
data : 2D data | |
		norm : normalization method
axis : row or column | |
""" | |
normalized = preprocessing.normalize(data,norm=norm, axis=axis) | |
return normalized | |
def standardizeMatrix(data, axis=0): | |
""" | |
standardizes each column of the matrix with mean and std deviation | |
Parameters | |
data : 2D data | |
axis : row or column | |
""" | |
standardized = preprocessing.scale(data, axis=axis) | |
return standardized | |
def asNumpyArray(data): | |
""" | |
converts to numpy array | |
Parameters | |
data : array | |
""" | |
return np.array(data) | |
def perfMetric(metric, yActual, yPred, clabels=None): | |
""" | |
predictive model accuracy metric | |
Parameters | |
metric : accuracy metric | |
yActual : actual values array | |
yPred : predicted values array | |
clabels : class labels | |
""" | |
if metric == "rsquare": | |
score = metrics.r2_score(yActual, yPred) | |
elif metric == "mae": | |
score = metrics.mean_absolute_error(yActual, yPred) | |
elif metric == "mse": | |
score = metrics.mean_squared_error(yActual, yPred) | |
elif metric == "acc": | |
yPred = np.rint(yPred) | |
score = metrics.accuracy_score(yActual, yPred) | |
elif metric == "mlAcc": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.accuracy_score(yActual, yPred) | |
elif metric == "prec": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.precision_score(yActual, yPred) | |
elif metric == "rec": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.recall_score(yActual, yPred) | |
elif metric == "fone": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.f1_score(yActual, yPred) | |
elif metric == "confm": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.confusion_matrix(yActual, yPred) | |
elif metric == "clarep": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.classification_report(yActual, yPred) | |
elif metric == "bce": | |
if clabels is None: | |
clabels = [0, 1] | |
score = metrics.log_loss(yActual, yPred, labels=clabels) | |
elif metric == "ce": | |
assert clabels is not None, "labels must be provided" | |
score = metrics.log_loss(yActual, yPred, labels=clabels) | |
else: | |
exitWithMsg("invalid prediction performance metric " + metric) | |
return score | |
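# Illustrative sketch: regression and binary classification metrics on small hand made arrays;
# for "acc" the predicted probabilities are rounded to class labels inside perfMetric.
def examplePerfMetric():
	"""
	sketch of perfMetric usage for regression and classification
	"""
	yActual = np.array([1.0, 2.0, 3.0, 4.0])
	yPred = np.array([1.1, 1.9, 3.2, 3.8])
	rsq = perfMetric("rsquare", yActual, yPred)
	mae = perfMetric("mae", yActual, yPred)
	clActual = np.array([0, 1, 1, 0])
	clPred = np.array([0.1, 0.8, 0.9, 0.4])
	acc = perfMetric("acc", clActual, clPred)
	return (rsq, mae, acc)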
def scaleData(data, method): | |
""" | |
scales feature data column wise | |
Parameters | |
data : 2D array | |
method : scaling method | |
""" | |
if method == "minmax": | |
scaler = preprocessing.MinMaxScaler() | |
data = scaler.fit_transform(data) | |
elif method == "zscale": | |
data = preprocessing.scale(data) | |
else: | |
raise ValueError("invalid scaling method") | |
return data | |
def scaleDataWithParams(data, method, scParams): | |
""" | |
scales feature data column wise | |
Parameters | |
data : 2D array | |
method : scaling method | |
scParams : scaling parameters | |
""" | |
if method == "minmax": | |
data = scaleMinMaxTabData(data, scParams) | |
elif method == "zscale": | |
		raise ValueError("zscale scaling with precomputed parameters not supported")
else: | |
raise ValueError("invalid scaling method") | |
return data | |
def scaleMinMaxScaData(data, minMax): | |
""" | |
minmax scales scalar data | |
Parameters | |
data : scalar data | |
minMax : min, max and range for each column | |
""" | |
sd = (data - minMax[0]) / minMax[2] | |
return sd | |
def scaleMinMaxTabData(tdata, minMax): | |
""" | |
for tabular scales feature data column wise using min max values for each field | |
Parameters | |
tdata : 2D array | |
minMax : min, max and range for each column | |
""" | |
stdata = list() | |
for r in tdata: | |
srdata = list() | |
for i, c in enumerate(r): | |
sd = (c - minMax[i][0]) / minMax[i][2] | |
srdata.append(sd) | |
stdata.append(srdata) | |
return stdata | |
def scaleMinMax(rdata, minMax): | |
""" | |
scales feature data column wise using min max values for each field | |
Parameters | |
rdata : data array | |
minMax : min, max and range for each column | |
""" | |
srdata = list() | |
for i in range(len(rdata)): | |
d = rdata[i] | |
sd = (d - minMax[i][0]) / minMax[i][2] | |
srdata.append(sd) | |
return srdata | |
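# Illustrative sketch: per column (min, max, range) triplets drive the min max scaling helpers
# defined above; the values are hand made.
def exampleMinMaxScaling():
	"""
	sketch of min max scaling for a single row and for tabular data
	"""
	minMax = [(0.0, 10.0, 10.0), (100.0, 200.0, 100.0)]
	row = [2.5, 150.0]
	srow = scaleMinMax(row, minMax)               # [0.25, 0.5]
	tdata = [[0.0, 100.0], [10.0, 200.0]]
	stdata = scaleMinMaxTabData(tdata, minMax)    # [[0.0, 0.0], [1.0, 1.0]]
	return (srow, stdata)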
def harmonicNum(n): | |
""" | |
harmonic number | |
Parameters | |
n : number | |
""" | |
h = 0 | |
for i in range(1, n+1, 1): | |
h += 1.0 / i | |
return h | |
def digammaFun(n): | |
""" | |
	digamma function
Parameters | |
n : number | |
""" | |
#Euler Mascheroni constant | |
ec = 0.577216 | |
return harmonicNum(n - 1) - ec | |
def getDataPartitions(tdata, types, columns = None): | |
""" | |
partitions data with the given columns and random split point defined with predicates | |
Parameters | |
tdata : 2D array | |
		types : data types
columns : column indexes | |
""" | |
(dtypes, cvalues) = extractTypesFromString(types) | |
if columns is None: | |
		ncol = len(tdata[0])
columns = list(range(ncol)) | |
ncol = len(columns) | |
#print(columns) | |
# partition predicates | |
partitions = None | |
for c in columns: | |
#print(c) | |
dtype = dtypes[c] | |
pred = list() | |
if dtype == "int" or dtype == "float": | |
(vmin, vmax) = getColMinMax(tdata, c) | |
r = vmax - vmin | |
rmin = vmin + .2 * r | |
rmax = vmax - .2 * r | |
sp = randomFloat(rmin, rmax) | |
if dtype == "int": | |
sp = int(sp) | |
else: | |
sp = "{:.3f}".format(sp) | |
sp = float(sp) | |
pred.append([c, "LT", sp]) | |
pred.append([c, "GE", sp]) | |
elif dtype == "cat": | |
cv = cvalues[c] | |
card = len(cv) | |
if card < 3: | |
num = 1 | |
else: | |
num = randomInt(1, card - 1) | |
sp = selectRandomSubListFromList(cv, num) | |
sp = " ".join(sp) | |
pred.append([c, "IN", sp]) | |
pred.append([c, "NOTIN", sp]) | |
#print(pred) | |
if partitions is None: | |
partitions = pred.copy() | |
#print("initial") | |
#print(partitions) | |
else: | |
#print("extension") | |
tparts = list() | |
for p in partitions: | |
#print(p) | |
l1 = p.copy() | |
l1.extend(pred[0]) | |
l2 = p.copy() | |
l2.extend(pred[1]) | |
#print("after extension") | |
#print(l1) | |
#print(l2) | |
tparts.append(l1) | |
tparts.append(l2) | |
partitions = tparts | |
#print("extending") | |
#print(partitions) | |
#for p in partitions: | |
#print(p) | |
return partitions | |
def genAlmostUniformDistr(size, nswap=50): | |
""" | |
generate probability distribution | |
Parameters | |
size : distr size | |
nswap : no of mass swaps | |
""" | |
un = 1.0 / size | |
distr = [un] * size | |
distr = mutDistr(distr, 0.1 * un, nswap) | |
return distr | |
def mutDistr(distr, shift, nswap=50): | |
""" | |
mutates a probability distribution | |
Parameters | |
		distr : distribution
shift : amount of shift for swap | |
nswap : no of mass swaps | |
""" | |
size = len(distr) | |
for _ in range(nswap): | |
fi = randomInt(0, size -1) | |
si = randomInt(0, size -1) | |
while fi == si: | |
fi = randomInt(0, size -1) | |
si = randomInt(0, size -1) | |
shift = randomFloat(0, shift) | |
t = distr[fi] | |
distr[fi] -= shift | |
if (distr[fi] < 0): | |
distr[fi] = 0.0 | |
shift = t | |
distr[si] += shift | |
return distr | |
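# Illustrative sketch: builds a nearly uniform distribution and perturbs it further; the random
# mass swaps preserve the total probability mass.
def exampleMutatedDistr():
	"""
	sketch of generating and mutating a probability distribution
	"""
	distr = genAlmostUniformDistr(5, nswap=20)
	distr = mutDistr(distr, 0.02, nswap=10)
	return distr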
def generateBinDistribution(size, ntrue): | |
""" | |
generate binary array with some elements set to 1 | |
Parameters | |
size : distr size | |
ntrue : no of true values | |
""" | |
distr = [0] * size | |
idxs = selectRandomSubListFromList(list(range(size)), ntrue) | |
for i in idxs: | |
distr[i] = 1 | |
return distr | |
def mutBinaryDistr(distr, nmut): | |
""" | |
mutate binary distribution | |
Parameters | |
distr : distr | |
nmut : no of mutations | |
""" | |
idxs = selectRandomSubListFromList(list(range(len(distr))), nmut) | |
for i in idxs: | |
distr[i] = distr[i] ^ 1 | |
return distr | |
def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=","): | |
""" | |
file record generator that superimposes given data in the specified segment of a column | |
Parameters | |
		filePath : file path
column : column index | |
offset : offset into column values | |
seqLen : length of subseq | |
modifier : data to be superimposed either list or a sampler object | |
precision : floating point precision | |
		delim : delimiter
""" | |
beg = offset | |
end = beg + seqLen | |
isList = type(modifier) == list | |
i = 0 | |
for rec in fileRecGen(filePath, delim): | |
if i >= beg and i < end: | |
va = float(rec[column]) | |
if isList: | |
va += modifier[i - beg] | |
else: | |
va += modifier.sample() | |
rec[column] = formatFloat(precision, va) | |
yield delim.join(rec) | |
i += 1 | |
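# Illustrative usage sketch: superimposes a constant spike on a 20 row segment of column 2
# starting at row 100; the file name is hypothetical.
def exampleSubSeqModifier():
	"""
	sketch of superimposing data on a column segment of a delimited file
	"""
	spike = [5.0] * 20
	modified = list(fileSelFieldSubSeqModifierGen("series.csv", 2, 100, 20, spike, 3))
	return modified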
class ShiftedDataGenerator: | |
""" | |
transforms data for distribution shift | |
""" | |
def __init__(self, types, tdata, addFact, multFact): | |
""" | |
initializer | |
Parameters | |
			types : data types
			tdata : 2D array
			addFact : factor for data shift
			multFact : factor for data scaling
""" | |
(self.dtypes, self.cvalues) = extractTypesFromString(types) | |
self.limits = dict() | |
for k,v in self.dtypes.items(): | |
			if v == "int" or v == "float":
				(vmin, vmax) = getColMinMax(tdata, k)
				self.limits[k] = vmax - vmin
self.addMin = - addFact / 2 | |
self.addMax = addFact / 2 | |
self.multMin = 1.0 - multFact / 2 | |
self.multMax = 1.0 + multFact / 2 | |
def transform(self, tdata): | |
""" | |
linear transforms data to create distribution shift with random shift and scale | |
Parameters | |
			tdata : 2D array
""" | |
transforms = dict() | |
for k,v in self.dtypes.items(): | |
			if v == "int" or v == "float":
shift = randomFloat(self.addMin, self.addMax) * self.limits[k] | |
scale = randomFloat(self.multMin, self.multMax) | |
trns = (shift, scale) | |
transforms[k] = trns | |
elif v == "cat": | |
transforms[k] = isEventSampled(50) | |
ttdata = list() | |
for rec in tdata: | |
nrec = rec.copy() | |
for c in range(len(rec)): | |
if c in self.dtypes: | |
dtype = self.dtypes[c] | |
if dtype == "int" or dtype == "float": | |
(shift, scale) = transforms[c] | |
nval = shift + rec[c] * scale | |
if dtype == "int": | |
nrec[c] = int(nval) | |
else: | |
nrec[c] = nval | |
elif dtype == "cat": | |
cv = self.cvalues[c] | |
if transforms[c]: | |
nval = selectOtherRandomFromList(cv, rec[c]) | |
nrec[c] = nval | |
ttdata.append(nrec) | |
return ttdata | |
def transformSpecified(self, tdata, sshift, scale): | |
""" | |
		linear transforms data to create distribution shift with specified shift and scale
Parameters | |
			tdata : 2D array
sshift : shift factor | |
scale : scale factor | |
""" | |
transforms = dict() | |
for k,v in self.dtypes.items(): | |
			if v == "int" or v == "float":
shift = sshift * self.limits[k] | |
trns = (shift, scale) | |
transforms[k] = trns | |
elif v == "cat": | |
transforms[k] = isEventSampled(50) | |
ttdata = self.__scaleShift(tdata, transforms) | |
return ttdata | |
def __scaleShift(self, tdata, transforms): | |
""" | |
shifts and scales tabular data | |
Parameters | |
tdata : 2D array | |
transforms : transforms to apply | |
""" | |
ttdata = list() | |
for rec in tdata: | |
nrec = rec.copy() | |
for c in range(len(rec)): | |
if c in self.dtypes: | |
dtype = self.dtypes[c] | |
if dtype == "int" or dtype == "float": | |
(shift, scale) = transforms[c] | |
nval = shift + rec[c] * scale | |
if dtype == "int": | |
nrec[c] = int(nval) | |
else: | |
nrec[c] = nval | |
elif dtype == "cat": | |
cv = self.cvalues[c] | |
if transforms[c]: | |
#nval = selectOtherRandomFromList(cv, rec[c]) | |
#nrec[c] = nval | |
pass | |
ttdata.append(nrec) | |
return ttdata | |
class RollingStat(object): | |
""" | |
	stats for rolling window
""" | |
def __init__(self, wsize): | |
""" | |
initializer | |
Parameters | |
wsize : window size | |
""" | |
self.window = list() | |
self.wsize = wsize | |
self.mean = None | |
self.sd = None | |
def add(self, value): | |
""" | |
add a value | |
Parameters | |
value : value to add | |
""" | |
self.window.append(value) | |
if len(self.window) > self.wsize: | |
self.window = self.window[1:] | |
def getStat(self): | |
""" | |
get rolling window mean and std deviation | |
""" | |
assertGreater(len(self.window), 0, "window is empty") | |
if len(self.window) == 1: | |
self.mean = self.window[0] | |
self.sd = 0 | |
else: | |
self.mean = statistics.mean(self.window) | |
self.sd = statistics.stdev(self.window, xbar=self.mean) | |
re = (self.mean, self.sd) | |
return re | |
def getSize(self): | |
""" | |
return window size | |
""" | |
return len(self.window) | |
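# Illustrative usage sketch: rolling mean and std deviation over a window of size 3; the window
# starts filling from the first value and then slides.
def exampleRollingStat():
	"""
	sketch of RollingStat usage
	"""
	rstat = RollingStat(3)
	stats = list()
	for v in [2.0, 4.0, 6.0, 8.0]:
		rstat.add(v)
		stats.append(rstat.getStat())
	return stats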