#!/usr/local/bin/python3 | |
# avenir-python: Machine Learning | |
# Author: Pranab Ghosh | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); you | |
# may not use this file except in compliance with the License. You may | |
# obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |
# implied. See the License for the specific language governing | |
# permissions and limitations under the License. | |
# Package imports | |
import os | |
import sys | |
import numpy as np | |
from sklearn import preprocessing | |
from sklearn import metrics | |
from sklearn.datasets import make_blobs | |
from sklearn.datasets import make_classification | |
import random | |
from math import * | |
from decimal import Decimal | |
import statistics | |
import jprops | |
from Levenshtein import distance as ld | |
from .util import * | |
from .sampler import * | |
class Configuration: | |
""" | |
Configuration management. Supports default value, mandatory value and typed value. | |
""" | |
def __init__(self, configFile, defValues, verbose=False): | |
""" | |
initializer | |
Parameters | |
configFile : config file path | |
defValues : dictionary of default values | |
verbose : verbosity flag | |
""" | |
configs = {} | |
with open(configFile) as fp: | |
for key, value in jprops.iter_properties(fp): | |
configs[key] = value | |
self.configs = configs | |
self.defValues = defValues | |
self.verbose = verbose | |
def override(self, configFile): | |
""" | |
		override configuration from file
Parameters | |
configFile : override config file path | |
""" | |
with open(configFile) as fp: | |
for key, value in jprops.iter_properties(fp): | |
self.configs[key] = value | |
def setParam(self, name, value): | |
""" | |
override individual configuration | |
Parameters | |
name : config param name | |
value : config param value | |
""" | |
self.configs[name] = value | |
def getStringConfig(self, name): | |
""" | |
get string param | |
Parameters | |
name : config param name | |
""" | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
val = (self.configs[name], False) | |
if self.verbose: | |
print( "{} {} {}".format(name, self.configs[name], val[0])) | |
return val | |
def getIntConfig(self, name): | |
""" | |
get int param | |
Parameters | |
name : config param name | |
""" | |
#print "%s %s" %(name,self.configs[name]) | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
val = (int(self.configs[name]), False) | |
if self.verbose: | |
print( "{} {} {}".format(name, self.configs[name], val[0])) | |
return val | |
def getFloatConfig(self, name): | |
""" | |
get float param | |
Parameters | |
name : config param name | |
""" | |
#print "%s %s" %(name,self.configs[name]) | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
val = (float(self.configs[name]), False) | |
if self.verbose: | |
print( "{} {} {:06.3f}".format(name, self.configs[name], val[0])) | |
return val | |
def getBooleanConfig(self, name): | |
""" | |
		get boolean param
Parameters | |
name : config param name | |
""" | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
bVal = self.configs[name].lower() == "true" | |
val = (bVal, False) | |
if self.verbose: | |
print( "{} {} {}".format(name, self.configs[name], val[0])) | |
return val | |
def getIntListConfig(self, name, delim=","): | |
""" | |
get int list param | |
Parameters | |
name : config param name | |
			delim : delimiter
""" | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
delSepStr = self.getStringConfig(name) | |
#specified as range | |
intList = strListOrRangeToIntArray(delSepStr[0]) | |
val =(intList, delSepStr[1]) | |
return val | |
def getFloatListConfig(self, name, delim=","): | |
""" | |
get float list param | |
Parameters | |
name : config param name | |
			delim : delimiter
""" | |
delSepStr = self.getStringConfig(name) | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
flList = strToFloatArray(delSepStr[0], delim) | |
val =(flList, delSepStr[1]) | |
return val | |
def getStringListConfig(self, name, delim=","): | |
""" | |
get string list param | |
Parameters | |
name : config param name | |
			delim : delimiter
""" | |
delSepStr = self.getStringConfig(name) | |
if self.isNone(name): | |
val = (None, False) | |
elif self.isDefault(name): | |
val = (self.handleDefault(name), True) | |
else: | |
strList = delSepStr[0].split(delim) | |
val = (strList, delSepStr[1]) | |
return val | |
def handleDefault(self, name): | |
""" | |
handles default | |
Parameters | |
name : config param name | |
""" | |
dVal = self.defValues[name] | |
if (dVal[1] is None): | |
val = dVal[0] | |
else: | |
raise ValueError(dVal[1]) | |
return val | |
def isNone(self, name): | |
""" | |
		true if the value is None
Parameters | |
name : config param name | |
""" | |
return self.configs[name].lower() == "none" | |
def isDefault(self, name): | |
""" | |
true if the value is default | |
Parameters | |
name : config param name | |
""" | |
de = self.configs[name] == "_" | |
#print de | |
return de | |
def eitherOrStringConfig(self, firstName, secondName): | |
""" | |
returns one of two string parameters | |
Parameters | |
firstName : first parameter name | |
secondName : second parameter name | |
""" | |
if not self.isNone(firstName): | |
first = self.getStringConfig(firstName)[0] | |
second = None | |
if not self.isNone(secondName): | |
raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName) | |
else: | |
if not self.isNone(secondName): | |
				second = self.getStringConfig(secondName)[0]
first = None | |
else: | |
raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName) | |
return (first, second) | |
def eitherOrIntConfig(self, firstName, secondName): | |
""" | |
returns one of two int parameters | |
Parameters | |
firstName : first parameter name | |
secondName : second parameter name | |
""" | |
if not self.isNone(firstName): | |
first = self.getIntConfig(firstName)[0] | |
second = None | |
if not self.isNone(secondName): | |
raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName) | |
else: | |
if not self.isNone(secondName): | |
				second = self.getIntConfig(secondName)[0]
first = None | |
else: | |
raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName) | |
return (first, second) | |
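# Illustrative usage sketch for Configuration: the properties file name and the property keys
# below are hypothetical; the file is expected to define each key, with the value "_" used to
# fall back to the default and "none" for a missing value.
def exampleConfigurationUsage():
	"""
	sketch of typical Configuration usage with default and mandatory values
	"""
	defValues = dict()
	defValues["train.data.file"] = (None, "missing training data file path")
	defValues["train.num.iter"] = (100, None)
	defValues["train.learning.rate"] = (0.01, None)
	config = Configuration("app.properties", defValues, verbose=True)
	dataFile = config.getStringConfig("train.data.file")[0]
	numIter = config.getIntConfig("train.num.iter")[0]
	lrate = config.getFloatConfig("train.learning.rate")[0]
	return (dataFile, numIter, lrate)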
class CatLabelGenerator: | |
""" | |
label generator for categorical variables | |
""" | |
def __init__(self, catValues, delim): | |
""" | |
		initializer
Parameters | |
catValues : dictionary of categorical values | |
			delim : delimiter
""" | |
self.encoders = {} | |
self.catValues = catValues | |
self.delim = delim | |
for k in self.catValues.keys(): | |
le = preprocessing.LabelEncoder() | |
le.fit(self.catValues[k]) | |
self.encoders[k] = le | |
def processRow(self, row): | |
""" | |
encode row categorical values | |
Parameters: | |
row : data row | |
""" | |
#print row | |
rowArr = row.split(self.delim) | |
for i in range(len(rowArr)): | |
if (i in self.catValues): | |
curVal = rowArr[i] | |
				assert curVal in self.catValues[i], "categorical value invalid"
encVal = self.encoders[i].transform([curVal]) | |
rowArr[i] = str(encVal[0]) | |
return self.delim.join(rowArr) | |
def getOrigLabels(self, indx): | |
""" | |
get original labels | |
Parameters: | |
indx : column index | |
""" | |
return self.encoders[indx].classes_ | |
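# Illustrative usage sketch for CatLabelGenerator: column 1 is declared categorical and gets
# label encoded, the remaining columns pass through unchanged; the values are hand made.
def exampleCatLabelGenerator():
	"""
	sketch of categorical label encoding for one column of a delimited row
	"""
	catValues = {1 : ["red", "green", "blue"]}
	gen = CatLabelGenerator(catValues, ",")
	encRow = gen.processRow("3.5,green,7")
	origLabels = gen.getOrigLabels(1)
	return (encRow, origLabels)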
class SupvLearningDataGenerator: | |
""" | |
data generator for supervised learning | |
""" | |
def __init__(self, configFile): | |
""" | |
		initializer
Parameters | |
configFile : config file path | |
""" | |
defValues = dict() | |
defValues["common.num.samp"] = (100, None) | |
defValues["common.num.feat"] = (5, None) | |
defValues["common.feat.trans"] = (None, None) | |
defValues["common.feat.types"] = (None, "missing feature types") | |
defValues["common.cat.feat.distr"] = (None, None) | |
defValues["common.output.precision"] = (3, None) | |
defValues["common.error"] = (0.01, None) | |
defValues["class.gen.technique"] = ("blob", None) | |
defValues["class.num.feat.informative"] = (2, None) | |
defValues["class.num.feat.redundant"] = (2, None) | |
defValues["class.num.feat.repeated"] = (0, None) | |
defValues["class.num.feat.cat"] = (0, None) | |
defValues["class.num.class"] = (2, None) | |
self.config = Configuration(configFile, defValues) | |
def genClassifierData(self): | |
""" | |
generates classifier data | |
""" | |
nsamp = self.config.getIntConfig("common.num.samp")[0] | |
nfeat = self.config.getIntConfig("common.num.feat")[0] | |
nclass = self.config.getIntConfig("class.num.class")[0] | |
#transform with shift and scale | |
ftrans = self.config.getFloatListConfig("common.feat.trans")[0] | |
feTrans = dict() | |
for i in range(0, len(ftrans), 2): | |
tr = (ftrans[i], ftrans[i+1]) | |
indx = int(i/2) | |
feTrans[indx] = tr | |
ftypes = self.config.getStringListConfig("common.feat.types")[0] | |
# categorical feature distribution | |
feCatDist = dict() | |
fcatdl = self.config.getStringListConfig("common.cat.feat.distr")[0] | |
for fcatds in fcatdl: | |
fcatd = fcatds.split(":") | |
feInd = int(fcatd[0]) | |
clVal = int(fcatd[1]) | |
key = (feInd, clVal) #feature index and class value | |
dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2))) | |
feCatDist[key] = CategoricalRejectSampler(*dist) | |
#shift and scale | |
genTechnique = self.config.getStringConfig("class.gen.technique")[0] | |
error = self.config.getFloatConfig("common.error")[0] | |
if genTechnique == "blob": | |
features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat) | |
for i in range(nsamp): #shift and scale | |
for j in range(nfeat): | |
tr = feTrans[j] | |
features[i,j] = (features[i,j] + tr[0]) * tr[1] | |
claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz))) | |
elif genTechnique == "classify": | |
nfeatInfo = self.config.getIntConfig("class.num.feat.informative")[0] | |
nfeatRed = self.config.getIntConfig("class.num.feat.redundant")[0] | |
nfeatRep = self.config.getIntConfig("class.num.feat.repeated")[0] | |
shifts = list(map(lambda i : feTrans[i][0], range(nfeat))) | |
scales = list(map(lambda i : feTrans[i][1], range(nfeat))) | |
features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed, | |
n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales) | |
else: | |
			raise ValueError("invalid generation technique")
# add categorical features and format | |
nCatFeat = self.config.getIntConfig("class.num.feat.cat")[0] | |
prec = self.config.getIntConfig("common.output.precision")[0] | |
for f , c in zip(features, claz): | |
			nfs = list(map(lambda i : self.numFeToStr(f[i], ftypes[i], prec), range(nfeat)))
if nCatFeat > 0: | |
cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1))) | |
rec = ",".join(nfs) + "," + ",".join(cfs) + "," + str(c) | |
else: | |
rec = ",".join(nfs) + "," + str(c) | |
yield rec | |
def numFeToStr(self, fv, ft, prec): | |
""" | |
		numeric feature value to string
Parameters | |
fv : field value | |
ft : field data type | |
prec : precision | |
""" | |
if ft == "float": | |
s = formatFloat(prec, fv) | |
elif ft =="int": | |
s = str(int(fv)) | |
else: | |
			raise ValueError("invalid type expecting float or int")
return s | |
def catFe(self, i, cv, ft, feCatDist): | |
""" | |
generate categorical feature | |
Parameters | |
i : col index | |
cv : class value | |
ft : field data type | |
feCatDist : cat value distribution | |
""" | |
if ft == "cat": | |
key = (i, cv) | |
s = feCatDist[key].sample() | |
else: | |
			raise ValueError("invalid type expecting categorical")
return s | |
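# Illustrative usage sketch for SupvLearningDataGenerator: generation is driven entirely by a
# properties file; the file name here is hypothetical and it must set the keys listed in
# __init__ (common.num.samp, common.feat.types, class.gen.technique and so on).
def exampleClassifierDataGen():
	"""
	sketch of synthetic classification data generation
	"""
	gen = SupvLearningDataGenerator("class_gen.properties")
	recs = list(gen.genClassifierData())
	return recs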
class RegressionDataGenerator: | |
""" | |
data generator for regression, including square terms, cross terms, bias, noise, correlated variables | |
and user defined function | |
""" | |
def __init__(self, configFile, callback=None): | |
""" | |
		initializer
Parameters | |
configFile : config file path | |
callback : user defined function | |
""" | |
defValues = dict() | |
defValues["common.pvar.samplers"] = (None, None) | |
defValues["common.pvar.ranges"] = (None, None) | |
defValues["common.linear.weights"] = (None, None) | |
defValues["common.square.weights"] = (None, None) | |
defValues["common.crterm.weights"] = (None, None) | |
defValues["common.corr.params"] = (None, None) | |
defValues["common.bias"] = (0, None) | |
defValues["common.noise"] = (None, None) | |
defValues["common.tvar.range"] = (None, None) | |
defValues["common.weight.niter"] = (20, None) | |
self.config = Configuration(configFile, defValues) | |
self.callback = callback | |
#samplers for predictor variables | |
items = self.config.getStringListConfig("common.pvar.samplers")[0] | |
self.samplers = list(map(lambda s : createSampler(s), items)) | |
self.npvar = len(self.samplers) | |
#values range for predictor variables | |
items = self.config.getStringListConfig("common.pvar.ranges")[0] | |
self.pvranges = list() | |
for i in range(0, len(items), 2): | |
if items[i] =="none": | |
r = None | |
else: | |
vmin = float(items[i]) | |
vmax = float(items[i+1]) | |
r = (vmin, vmax, vmax-vmin) | |
self.pvranges.append(r) | |
		assertEqual(len(self.pvranges), self.npvar, "no of predictor var ranges provided is invalid")
#linear weights for predictor variables | |
self.lweights = self.config.getFloatListConfig("common.linear.weights")[0] | |
		assertEqual(len(self.lweights), self.npvar, "no of linear weights provided is invalid")
#square weights for predictor variables | |
items = self.config.getStringListConfig("common.square.weights")[0] | |
self.sqweight = dict() | |
for i in range(0, len(items), 2): | |
vi = int(items[i]) | |
assertLesser(vi, self.npvar, "invalid predictor var index") | |
wt = float(items[i+1]) | |
self.sqweight[vi] = wt | |
#crossterm weights for predictor variables | |
items = self.config.getStringListConfig("common.crterm.weights")[0] | |
self.crweight = dict() | |
for i in range(0, len(items), 3): | |
vi = int(items[i]) | |
assertLesser(vi, self.npvar, "invalid predictor var index") | |
vj = int(items[i+1]) | |
assertLesser(vj, self.npvar, "invalid predictor var index") | |
wt = float(items[i+2]) | |
vp = (vi, vj) | |
self.crweight[vp] = wt | |
#correlated variables | |
items = self.config.getStringListConfig("common.corr.params")[0] | |
self.corrparams = dict() | |
for co in items: | |
cparam = co.split(":") | |
vi = int(cparam[0]) | |
vj = int(cparam[1]) | |
k = (vi,vj) | |
bias = float(cparam[2]) | |
wt = float(cparam[3]) | |
noise = float(cparam[4]) | |
roundoff = cparam[5] == "true" | |
v = (bias, wt, noise, roundoff) | |
self.corrparams[k] = v | |
		#bias, noise and target range values
self.bias = self.config.getFloatConfig("common.bias")[0] | |
noise = self.config.getStringListConfig("common.noise")[0] | |
self.ndistr = noise[0] | |
self.noise = float(noise[1]) | |
self.tvarlim = self.config.getFloatListConfig("common.tvar.range")[0] | |
#sample | |
niter = self.config.getIntConfig("common.weight.niter")[0] | |
yvals = list() | |
for i in range(niter): | |
y = self.sample()[1] | |
yvals.append(y) | |
#scale weights by sampled mean and target mean | |
my = statistics.mean(yvals) | |
myt =(self.tvarlim[1] - self.tvarlim[0]) / 2 | |
sc = (myt - self.bias) / (my - self.bias) | |
#print("weight scale {:.3f}".format(sc)) | |
self.lweights = list(map(lambda w : w * sc, self.lweights)) | |
#print("weights {}".format(toStrFromList(self.lweights, 3))) | |
for k in self.sqweight.keys(): | |
self.sqweight[k] *= sc | |
for k in self.crweight.keys(): | |
self.crweight[k] *= sc | |
def sample(self): | |
""" | |
sample predictor variables and target variable | |
""" | |
pvd = list(map(lambda s : s.sample(), self.samplers)) | |
#correct for correlated variables | |
for k in self.corrparams.keys(): | |
vi = k[0] | |
vj = k[1] | |
v = self.corrparams[k] | |
bias = v[0] | |
wt = v[1] | |
noise = v[2] | |
roundoff = v[3] | |
nv = bias + wt * pvd[vi] | |
pvd[vj] = preturbScalar(nv, noise, "normal") | |
if roundoff: | |
pvd[vj] = round(pvd[vj]) | |
spvd = list() | |
lsum = self.bias | |
for i in range(self.npvar): | |
#range limit | |
if self.pvranges[i] is not None: | |
pvd[i] = rangeLimit(pvd[i], self.pvranges[i][0], self.pvranges[i][1]) | |
spvd.append(pvd[i]) | |
#scale | |
pvd[i] = scaleMinMaxScaData(pvd[i], self.pvranges[i]) | |
lsum += self.lweights[i] * pvd[i] | |
#square terms | |
ssum = 0 | |
for k in self.sqweight.keys(): | |
			ssum += self.sqweight[k] * pvd[k] * pvd[k]
#cross terms | |
crsum = 0 | |
for k in self.crweight.keys(): | |
vi = k[0] | |
vj = k[1] | |
crsum += self.crweight[k] * pvd[vi] * pvd[vj] | |
y = lsum + ssum + crsum | |
y = preturbScalar(y, self.noise, self.ndistr) | |
if self.callback is not None: | |
ufy = self.callback(spvd) | |
y += ufy | |
r = (spvd, y) | |
return r | |
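# Illustrative usage sketch for RegressionDataGenerator: each call to sample() returns a
# (predictor values, target value) pair; the config file name is hypothetical and must set the
# common.* keys listed in __init__.
def exampleRegressionDataGen():
	"""
	sketch of synthetic regression data generation
	"""
	gen = RegressionDataGenerator("regr_gen.properties")
	samples = list()
	for _ in range(5):
		pvd, y = gen.sample()
		samples.append((pvd, y))
	return samples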
def loadDataFile(file, delim, cols, colIndices): | |
""" | |
loads delim separated file and extracts columns | |
Parameters | |
file : file path | |
		delim : delimiter
		cols : columns to use from file
		colIndices : columns to extract
""" | |
data = np.loadtxt(file, delimiter=delim, usecols=cols) | |
extrData = data[:,colIndices] | |
return (data, extrData) | |
def loadFeatDataFile(file, delim, cols): | |
""" | |
loads delim separated file and extracts columns | |
Parameters | |
file : file path | |
		delim : delimiter
cols : columns to use from file | |
""" | |
data = np.loadtxt(file, delimiter=delim, usecols=cols) | |
return data | |
def extrColumns(arr, columns): | |
""" | |
extracts columns | |
Parameters | |
arr : 2D array | |
columns : columns | |
""" | |
return arr[:, columns] | |
def subSample(featData, clsData, subSampleRate, withReplacement): | |
""" | |
subsample feature and class label data | |
Parameters | |
featData : 2D array of feature data | |
clsData : arrray of class labels | |
subSampleRate : fraction to be sampled | |
withReplacement : true if sampling with replacement | |
""" | |
sampSize = int(featData.shape[0] * subSampleRate) | |
sampledIndx = np.random.choice(featData.shape[0],sampSize, replace=withReplacement) | |
sampFeat = featData[sampledIndx] | |
sampCls = clsData[sampledIndx] | |
return(sampFeat, sampCls) | |
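# Illustrative sketch: draws a 50 percent row sample without replacement while keeping feature
# rows and class labels aligned; the arrays are hand made.
def exampleSubSample():
	"""
	sketch of sub sampling feature and class label data
	"""
	featData = np.arange(20).reshape(10, 2)
	clsData = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
	sampFeat, sampCls = subSample(featData, clsData, 0.5, False)
	return (sampFeat, sampCls)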
def euclideanDistance(x,y): | |
""" | |
euclidean distance | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y))) | |
def squareRooted(x): | |
""" | |
square root of sum square | |
Parameters | |
x : data vector | |
""" | |
return round(sqrt(sum([a*a for a in x])),3) | |
def cosineSimilarity(x,y): | |
""" | |
cosine similarity | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
numerator = sum(a*b for a,b in zip(x,y)) | |
denominator = squareRooted(x) * squareRooted(y) | |
return round(numerator / float(denominator), 3) | |
def cosineDistance(x,y): | |
""" | |
cosine distance | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
return 1.0 - cosineSimilarity(x,y) | |
def manhattanDistance(x,y): | |
""" | |
manhattan distance | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
return sum(abs(a-b) for a,b in zip(x,y)) | |
def nthRoot(value, nRoot): | |
""" | |
nth root | |
Parameters | |
value : data value | |
nRoot : root | |
""" | |
rootValue = 1/float(nRoot) | |
return round (Decimal(value) ** Decimal(rootValue),3) | |
def minkowskiDistance(x,y,pValue): | |
""" | |
minkowski distance | |
Parameters | |
x : first vector | |
		y : second vector
pValue : power factor | |
""" | |
return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue) | |
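# Illustrative sketch: for pValue 1 the Minkowski distance reduces to the Manhattan distance and
# for pValue 2 to the Euclidean distance (up to the 3 place rounding used above).
def exampleDistanceMetrics():
	"""
	sketch relating the distance functions defined above
	"""
	x = [1.0, 2.0, 3.0]
	y = [4.0, 0.0, 3.0]
	dman = manhattanDistance(x, y)          # 5.0
	deuc = euclideanDistance(x, y)          # sqrt(13), about 3.606
	dmk1 = minkowskiDistance(x, y, 1)       # matches dman
	dmk2 = minkowskiDistance(x, y, 2)       # matches deuc after rounding
	return (dman, deuc, dmk1, dmk2)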
def jaccardSimilarityX(x,y): | |
""" | |
jaccard similarity | |
Parameters | |
x : first vector | |
		y : second vector
""" | |
intersectionCardinality = len(set.intersection(*[set(x), set(y)])) | |
unionCardinality = len(set.union(*[set(x), set(y)])) | |
return intersectionCardinality/float(unionCardinality) | |
def jaccardSimilarity(x,y,wx=1.0,wy=1.0): | |
""" | |
jaccard similarity | |
Parameters | |
x : first vector | |
		y : second vector
wx : weight for x | |
wy : weight for y | |
""" | |
sx = set(x) | |
sy = set(y) | |
sxyInt = sx.intersection(sy) | |
intCardinality = len(sxyInt) | |
sxIntDiff = sx.difference(sxyInt) | |
syIntDiff = sy.difference(sxyInt) | |
unionCardinality = len(sx.union(sy)) | |
return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff)) | |
def levenshteinSimilarity(s1, s2): | |
""" | |
Levenshtein similarity for strings | |
Parameters | |
		s1 : first string
		s2 : second string
""" | |
assert type(s1) == str and type(s2) == str, "Levenshtein similarity is for string only" | |
d = ld(s1,s2) | |
#print(d) | |
l = max(len(s1),len(s2)) | |
d = 1.0 - min(d/l, 1.0) | |
return d | |
def norm(values, po=2): | |
""" | |
norm | |
Parameters | |
values : list of values | |
po : power | |
""" | |
no = sum(list(map(lambda v: pow(v,po), values))) | |
no = pow(no,1.0/po) | |
return list(map(lambda v: v/no, values)) | |
def createOneHotVec(size, indx = -1): | |
""" | |
random one hot vector | |
Parameters | |
size : vector size | |
indx : one hot position | |
""" | |
vec = [0] * size | |
s = random.randint(0, size - 1) if indx < 0 else indx | |
vec[s] = 1 | |
return vec | |
def createAllOneHotVec(size): | |
""" | |
create all one hot vectors | |
Parameters | |
size : vector size and no of vectors | |
""" | |
vecs = list() | |
for i in range(size): | |
vec = [0] * size | |
vec[i] = 1 | |
vecs.append(vec) | |
return vecs | |
def blockShuffle(data, blockSize): | |
""" | |
block shuffle | |
Parameters | |
data : list data | |
blockSize : block size | |
""" | |
numBlock = int(len(data) / blockSize) | |
remain = len(data) % blockSize | |
numBlock += (1 if remain > 0 else 0) | |
shuffled = list() | |
for i in range(numBlock): | |
b = random.randint(0, numBlock-1) | |
beg = b * blockSize | |
if (b < numBlock-1): | |
end = beg + blockSize | |
shuffled.extend(data[beg:end]) | |
else: | |
shuffled.extend(data[beg:]) | |
return shuffled | |
def shuffle(data, numShuffle): | |
""" | |
	shuffle data by random swapping
Parameters | |
data : list data | |
numShuffle : no of pairwise swaps | |
""" | |
sz = len(data) | |
if numShuffle is None: | |
numShuffle = int(sz / 2) | |
for i in range(numShuffle): | |
fi = random.randint(0, sz -1) | |
se = random.randint(0, sz -1) | |
tmp = data[fi] | |
data[fi] = data[se] | |
data[se] = tmp | |
def randomWalk(size, start, lowStep, highStep): | |
""" | |
random walk | |
Parameters | |
		size : no of steps
start : initial position | |
lowStep : step min | |
highStep : step max | |
""" | |
cur = start | |
for i in range(size): | |
yield cur | |
cur += randomFloat(lowStep, highStep) | |
def binaryEcodeCategorical(values, value): | |
""" | |
one hot binary encoding | |
Parameters | |
values : list of values | |
value : value to be replaced with 1 | |
""" | |
size = len(values) | |
vec = [0] * size | |
for i in range(size): | |
if (values[i] == value): | |
vec[i] = 1 | |
return vec | |
def createLabeledSeq(inputData, tw): | |
""" | |
Creates feature, label pair from sequence data, where we have tw number of features followed by output | |
Parameters | |
		inputData : list of sequence data values
tw : no of features | |
""" | |
features = list() | |
labels = list() | |
	l = len(inputData)
for i in range(l - tw): | |
trainSeq = inputData[i:i+tw] | |
trainLabel = inputData[i+tw] | |
features.append(trainSeq) | |
labels.append(trainLabel) | |
return (features, labels) | |
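# Illustrative sketch: a window size of 3 turns the sequence [1, 2, 3, 4, 5] into features
# [[1, 2, 3], [2, 3, 4]] with labels [4, 5].
def exampleCreateLabeledSeq():
	"""
	sketch of sliding window feature and label creation from sequence data
	"""
	seq = [1, 2, 3, 4, 5]
	features, labels = createLabeledSeq(seq, 3)
	return (features, labels)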
def createLabeledSeqFromFile(filePath, delim, index, tw):
	"""
	Creates feature, label pair from 1D sequence data in file
	Parameters
		filePath : file path
		delim : delimiter
		index : column index
		tw : no of features
	"""
	seqData = getFileColumnAsFloat(filePath, delim, index)
	return createLabeledSeq(seqData, tw)
def fromMultDimSeqToTabular(data, inpSize, seqLen): | |
""" | |
Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize) | |
Parameters | |
data : 2D array | |
inpSize : each input size in sequence | |
seqLen : sequence length | |
""" | |
nrow = data.shape[0] | |
assert data.shape[1] == inpSize * seqLen, "invalid input size or sequence length" | |
return data.reshape(nrow * seqLen, inpSize) | |
def fromTabularToMultDimSeq(data, inpSize, seqLen): | |
""" | |
Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen) | |
Parameters | |
data : 2D array | |
inpSize : each input size in sequence | |
seqLen : sequence length | |
""" | |
nrow = int(data.shape[0] / seqLen) | |
assert data.shape[1] == inpSize, "invalid input size" | |
return data.reshape(nrow, seqLen * inpSize) | |
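# Illustrative sketch: round trip between the flattened sequence layout (nrow, inpSize * seqLen)
# and the tabular layout (nrow * seqLen, inpSize).
def exampleSeqTabularReshape():
	"""
	sketch of reshaping between sequence and tabular layouts
	"""
	data = np.arange(24).reshape(4, 6)           # 4 rows, inpSize 2, seqLen 3
	tab = fromMultDimSeqToTabular(data, 2, 3)    # shape (12, 2)
	seq = fromTabularToMultDimSeq(tab, 2, 3)     # back to shape (4, 6)
	return np.array_equal(data, seq)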
def difference(data, interval=1): | |
""" | |
takes difference in time series data | |
Parameters | |
data :list data | |
interval : interval for difference | |
""" | |
diff = list() | |
for i in range(interval, len(data)): | |
value = data[i] - data[i - interval] | |
diff.append(value) | |
return diff | |
def normalizeMatrix(data, norm, axis=1): | |
""" | |
normalized each row of the matrix | |
Parameters | |
data : 2D data | |
		norm : normalization method
axis : row or column | |
""" | |
normalized = preprocessing.normalize(data,norm=norm, axis=axis) | |
return normalized | |
def standardizeMatrix(data, axis=0): | |
""" | |
standardizes each column of the matrix with mean and std deviation | |
Parameters | |
data : 2D data | |
axis : row or column | |
""" | |
standardized = preprocessing.scale(data, axis=axis) | |
return standardized | |
def asNumpyArray(data): | |
""" | |
converts to numpy array | |
Parameters | |
data : array | |
""" | |
return np.array(data) | |
def perfMetric(metric, yActual, yPred, clabels=None): | |
""" | |
predictive model accuracy metric | |
Parameters | |
metric : accuracy metric | |
yActual : actual values array | |
yPred : predicted values array | |
clabels : class labels | |
""" | |
if metric == "rsquare": | |
score = metrics.r2_score(yActual, yPred) | |
elif metric == "mae": | |
score = metrics.mean_absolute_error(yActual, yPred) | |
elif metric == "mse": | |
score = metrics.mean_squared_error(yActual, yPred) | |
elif metric == "acc": | |
yPred = np.rint(yPred) | |
score = metrics.accuracy_score(yActual, yPred) | |
elif metric == "mlAcc": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.accuracy_score(yActual, yPred) | |
elif metric == "prec": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.precision_score(yActual, yPred) | |
elif metric == "rec": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.recall_score(yActual, yPred) | |
elif metric == "fone": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.f1_score(yActual, yPred) | |
elif metric == "confm": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.confusion_matrix(yActual, yPred) | |
elif metric == "clarep": | |
yPred = np.argmax(yPred, axis=1) | |
score = metrics.classification_report(yActual, yPred) | |
elif metric == "bce": | |
if clabels is None: | |
clabels = [0, 1] | |
score = metrics.log_loss(yActual, yPred, labels=clabels) | |
elif metric == "ce": | |
assert clabels is not None, "labels must be provided" | |
score = metrics.log_loss(yActual, yPred, labels=clabels) | |
else: | |
exitWithMsg("invalid prediction performance metric " + metric) | |
return score | |
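# Illustrative sketch: regression and binary classification metrics on small hand made arrays;
# for "acc" the predicted probabilities are rounded to class labels inside perfMetric.
def examplePerfMetric():
	"""
	sketch of perfMetric usage for regression and classification
	"""
	yActual = np.array([1.0, 2.0, 3.0, 4.0])
	yPred = np.array([1.1, 1.9, 3.2, 3.8])
	rsq = perfMetric("rsquare", yActual, yPred)
	mae = perfMetric("mae", yActual, yPred)
	clActual = np.array([0, 1, 1, 0])
	clPred = np.array([0.1, 0.8, 0.9, 0.4])
	acc = perfMetric("acc", clActual, clPred)
	return (rsq, mae, acc)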
def scaleData(data, method): | |
""" | |
scales feature data column wise | |
Parameters | |
data : 2D array | |
method : scaling method | |
""" | |
if method == "minmax": | |
scaler = preprocessing.MinMaxScaler() | |
data = scaler.fit_transform(data) | |
elif method == "zscale": | |
data = preprocessing.scale(data) | |
else: | |
raise ValueError("invalid scaling method") | |
return data | |
def scaleDataWithParams(data, method, scParams): | |
""" | |
scales feature data column wise | |
Parameters | |
data : 2D array | |
method : scaling method | |
scParams : scaling parameters | |
""" | |
if method == "minmax": | |
data = scaleMinMaxTabData(data, scParams) | |
elif method == "zscale": | |
		raise ValueError("zscale scaling with precomputed parameters not supported")
else: | |
raise ValueError("invalid scaling method") | |
return data | |
def scaleMinMaxScaData(data, minMax): | |
""" | |
minmax scales scalar data | |
Parameters | |
data : scalar data | |
minMax : min, max and range for each column | |
""" | |
sd = (data - minMax[0]) / minMax[2] | |
return sd | |
def scaleMinMaxTabData(tdata, minMax): | |
""" | |
for tabular scales feature data column wise using min max values for each field | |
Parameters | |
tdata : 2D array | |
minMax : min, max and range for each column | |
""" | |
stdata = list() | |
for r in tdata: | |
srdata = list() | |
for i, c in enumerate(r): | |
sd = (c - minMax[i][0]) / minMax[i][2] | |
srdata.append(sd) | |
stdata.append(srdata) | |
return stdata | |
def scaleMinMax(rdata, minMax): | |
""" | |
scales feature data column wise using min max values for each field | |
Parameters | |
rdata : data array | |
minMax : min, max and range for each column | |
""" | |
srdata = list() | |
for i in range(len(rdata)): | |
d = rdata[i] | |
sd = (d - minMax[i][0]) / minMax[i][2] | |
srdata.append(sd) | |
return srdata | |
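# Illustrative sketch: per column (min, max, range) triplets drive the min max scaling helpers
# defined above; the values are hand made.
def exampleMinMaxScaling():
	"""
	sketch of min max scaling for a single row and for tabular data
	"""
	minMax = [(0.0, 10.0, 10.0), (100.0, 200.0, 100.0)]
	row = [2.5, 150.0]
	srow = scaleMinMax(row, minMax)               # [0.25, 0.5]
	tdata = [[0.0, 100.0], [10.0, 200.0]]
	stdata = scaleMinMaxTabData(tdata, minMax)    # [[0.0, 0.0], [1.0, 1.0]]
	return (srow, stdata)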
def harmonicNum(n): | |
""" | |
harmonic number | |
Parameters | |
n : number | |
""" | |
h = 0 | |
for i in range(1, n+1, 1): | |
h += 1.0 / i | |
return h | |
def digammaFun(n): | |
""" | |
	digamma function
Parameters | |
n : number | |
""" | |
#Euler Mascheroni constant | |
ec = 0.577216 | |
return harmonicNum(n - 1) - ec | |
def getDataPartitions(tdata, types, columns = None): | |
""" | |
partitions data with the given columns and random split point defined with predicates | |
Parameters | |
tdata : 2D array | |
		types : data types
columns : column indexes | |
""" | |
(dtypes, cvalues) = extractTypesFromString(types) | |
if columns is None: | |
		ncol = len(tdata[0])
columns = list(range(ncol)) | |
ncol = len(columns) | |
#print(columns) | |
# partition predicates | |
partitions = None | |
for c in columns: | |
#print(c) | |
dtype = dtypes[c] | |
pred = list() | |
if dtype == "int" or dtype == "float": | |
(vmin, vmax) = getColMinMax(tdata, c) | |
r = vmax - vmin | |
rmin = vmin + .2 * r | |
rmax = vmax - .2 * r | |
sp = randomFloat(rmin, rmax) | |
if dtype == "int": | |
sp = int(sp) | |
else: | |
sp = "{:.3f}".format(sp) | |
sp = float(sp) | |
pred.append([c, "LT", sp]) | |
pred.append([c, "GE", sp]) | |
elif dtype == "cat": | |
cv = cvalues[c] | |
card = len(cv) | |
if card < 3: | |
num = 1 | |
else: | |
num = randomInt(1, card - 1) | |
sp = selectRandomSubListFromList(cv, num) | |
sp = " ".join(sp) | |
pred.append([c, "IN", sp]) | |
pred.append([c, "NOTIN", sp]) | |
#print(pred) | |
if partitions is None: | |
partitions = pred.copy() | |
#print("initial") | |
#print(partitions) | |
else: | |
#print("extension") | |
tparts = list() | |
for p in partitions: | |
#print(p) | |
l1 = p.copy() | |
l1.extend(pred[0]) | |
l2 = p.copy() | |
l2.extend(pred[1]) | |
#print("after extension") | |
#print(l1) | |
#print(l2) | |
tparts.append(l1) | |
tparts.append(l2) | |
partitions = tparts | |
#print("extending") | |
#print(partitions) | |
#for p in partitions: | |
#print(p) | |
return partitions | |
def genAlmostUniformDistr(size, nswap=50): | |
""" | |
generate probability distribution | |
Parameters | |
size : distr size | |
nswap : no of mass swaps | |
""" | |
un = 1.0 / size | |
distr = [un] * size | |
distr = mutDistr(distr, 0.1 * un, nswap) | |
return distr | |
def mutDistr(distr, shift, nswap=50): | |
""" | |
mutates a probability distribution | |
Parameters | |
		distr : distribution
shift : amount of shift for swap | |
nswap : no of mass swaps | |
""" | |
size = len(distr) | |
for _ in range(nswap): | |
fi = randomInt(0, size -1) | |
si = randomInt(0, size -1) | |
while fi == si: | |
fi = randomInt(0, size -1) | |
si = randomInt(0, size -1) | |
shift = randomFloat(0, shift) | |
t = distr[fi] | |
distr[fi] -= shift | |
if (distr[fi] < 0): | |
distr[fi] = 0.0 | |
shift = t | |
distr[si] += shift | |
return distr | |
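# Illustrative sketch: builds a nearly uniform distribution and perturbs it further; the random
# mass swaps preserve the total probability mass.
def exampleMutatedDistr():
	"""
	sketch of generating and mutating a probability distribution
	"""
	distr = genAlmostUniformDistr(5, nswap=20)
	distr = mutDistr(distr, 0.02, nswap=10)
	return distr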
def generateBinDistribution(size, ntrue): | |
""" | |
generate binary array with some elements set to 1 | |
Parameters | |
size : distr size | |
ntrue : no of true values | |
""" | |
distr = [0] * size | |
idxs = selectRandomSubListFromList(list(range(size)), ntrue) | |
for i in idxs: | |
distr[i] = 1 | |
return distr | |
def mutBinaryDistr(distr, nmut): | |
""" | |
mutate binary distribution | |
Parameters | |
distr : distr | |
nmut : no of mutations | |
""" | |
idxs = selectRandomSubListFromList(list(range(len(distr))), nmut) | |
for i in idxs: | |
distr[i] = distr[i] ^ 1 | |
return distr | |
def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=","): | |
""" | |
file record generator that superimposes given data in the specified segment of a column | |
Parameters | |
		filePath : file path
column : column index | |
offset : offset into column values | |
seqLen : length of subseq | |
modifier : data to be superimposed either list or a sampler object | |
precision : floating point precision | |
		delim : delimiter
""" | |
beg = offset | |
end = beg + seqLen | |
isList = type(modifier) == list | |
i = 0 | |
for rec in fileRecGen(filePath, delim): | |
if i >= beg and i < end: | |
va = float(rec[column]) | |
if isList: | |
va += modifier[i - beg] | |
else: | |
va += modifier.sample() | |
rec[column] = formatFloat(precision, va) | |
yield delim.join(rec) | |
i += 1 | |
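# Illustrative usage sketch: superimposes a constant spike on a 20 row segment of column 2
# starting at row 100; the file name is hypothetical.
def exampleSubSeqModifier():
	"""
	sketch of superimposing data on a column segment of a delimited file
	"""
	spike = [5.0] * 20
	modified = list(fileSelFieldSubSeqModifierGen("series.csv", 2, 100, 20, spike, 3))
	return modified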
class ShiftedDataGenerator: | |
""" | |
transforms data for distribution shift | |
""" | |
def __init__(self, types, tdata, addFact, multFact): | |
""" | |
initializer | |
Parameters | |
			types : data types
			tdata : 2D array
			addFact : factor for data shift
			multFact : factor for data scaling
""" | |
(self.dtypes, self.cvalues) = extractTypesFromString(types) | |
self.limits = dict() | |
for k,v in self.dtypes.items(): | |
			if v == "int" or v == "float":
				(vmin, vmax) = getColMinMax(tdata, k)
				self.limits[k] = vmax - vmin
self.addMin = - addFact / 2 | |
self.addMax = addFact / 2 | |
self.multMin = 1.0 - multFact / 2 | |
self.multMax = 1.0 + multFact / 2 | |
def transform(self, tdata): | |
""" | |
linear transforms data to create distribution shift with random shift and scale | |
Parameters | |
			tdata : 2D array
""" | |
transforms = dict() | |
for k,v in self.dtypes.items(): | |
			if v == "int" or v == "float":
shift = randomFloat(self.addMin, self.addMax) * self.limits[k] | |
scale = randomFloat(self.multMin, self.multMax) | |
trns = (shift, scale) | |
transforms[k] = trns | |
elif v == "cat": | |
transforms[k] = isEventSampled(50) | |
ttdata = list() | |
for rec in tdata: | |
nrec = rec.copy() | |
for c in range(len(rec)): | |
if c in self.dtypes: | |
dtype = self.dtypes[c] | |
if dtype == "int" or dtype == "float": | |
(shift, scale) = transforms[c] | |
nval = shift + rec[c] * scale | |
if dtype == "int": | |
nrec[c] = int(nval) | |
else: | |
nrec[c] = nval | |
elif dtype == "cat": | |
cv = self.cvalues[c] | |
if transforms[c]: | |
nval = selectOtherRandomFromList(cv, rec[c]) | |
nrec[c] = nval | |
ttdata.append(nrec) | |
return ttdata | |
def transformSpecified(self, tdata, sshift, scale): | |
""" | |
		linear transforms data to create distribution shift with specified shift and scale
Parameters | |
			tdata : 2D array
sshift : shift factor | |
scale : scale factor | |
""" | |
transforms = dict() | |
for k,v in self.dtypes.items(): | |
			if v == "int" or v == "float":
shift = sshift * self.limits[k] | |
trns = (shift, scale) | |
transforms[k] = trns | |
elif v == "cat": | |
transforms[k] = isEventSampled(50) | |
ttdata = self.__scaleShift(tdata, transforms) | |
return ttdata | |
def __scaleShift(self, tdata, transforms): | |
""" | |
shifts and scales tabular data | |
Parameters | |
tdata : 2D array | |
transforms : transforms to apply | |
""" | |
ttdata = list() | |
for rec in tdata: | |
nrec = rec.copy() | |
for c in range(len(rec)): | |
if c in self.dtypes: | |
dtype = self.dtypes[c] | |
if dtype == "int" or dtype == "float": | |
(shift, scale) = transforms[c] | |
nval = shift + rec[c] * scale | |
if dtype == "int": | |
nrec[c] = int(nval) | |
else: | |
nrec[c] = nval | |
elif dtype == "cat": | |
cv = self.cvalues[c] | |
if transforms[c]: | |
#nval = selectOtherRandomFromList(cv, rec[c]) | |
#nrec[c] = nval | |
pass | |
ttdata.append(nrec) | |
return ttdata | |
class RollingStat(object): | |
""" | |
	stats for rolling window
""" | |
def __init__(self, wsize): | |
""" | |
initializer | |
Parameters | |
wsize : window size | |
""" | |
self.window = list() | |
self.wsize = wsize | |
self.mean = None | |
self.sd = None | |
def add(self, value): | |
""" | |
add a value | |
Parameters | |
value : value to add | |
""" | |
self.window.append(value) | |
if len(self.window) > self.wsize: | |
self.window = self.window[1:] | |
def getStat(self): | |
""" | |
get rolling window mean and std deviation | |
""" | |
assertGreater(len(self.window), 0, "window is empty") | |
if len(self.window) == 1: | |
self.mean = self.window[0] | |
self.sd = 0 | |
else: | |
self.mean = statistics.mean(self.window) | |
self.sd = statistics.stdev(self.window, xbar=self.mean) | |
re = (self.mean, self.sd) | |
return re | |
def getSize(self): | |
""" | |
return window size | |
""" | |
return len(self.window) | |
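# Illustrative usage sketch: rolling mean and std deviation over a window of size 3; the window
# starts filling from the first value and then slides.
def exampleRollingStat():
	"""
	sketch of RollingStat usage
	"""
	rstat = RollingStat(3)
	stats = list()
	for v in [2.0, 4.0, 6.0, 8.0]:
		rstat.add(v)
		stats.append(rstat.getStat())
	return stats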