Priyanka-Kumavat-At-TE committed on
Commit e03eaf2
1 Parent(s): 8c99283

Upload 7 files

matumizi/__init__.py ADDED
File without changes
matumizi/daexp.py ADDED
@@ -0,0 +1,3121 @@
#!/usr/local/bin/python3

# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import preprocessing
from sklearn import metrics
import random
from math import *
from decimal import Decimal
import pprint
from statsmodels.graphics import tsaplots
from statsmodels.tsa import stattools as stt
from statsmodels.stats import stattools as sstt
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
from scipy import stats as sta
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import hurst
from .util import *
from .mlutil import *
from .sampler import *
from .stats import *

"""
Load data from a CSV file, data frame, numpy array or list.
Each data set (array like) is given a name while loading.
Perform various data exploration operations, referring to the data sets by name.
Save and restore the workspace if needed.
"""
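
# A minimal usage sketch (illustrative only, not part of the original module):
# columns are registered as named data sets, then explored by name. The file
# name and column names below are hypothetical.
#
#   from matumizi.daexp import DataExplorer
#   expl = DataExplorer()
#   expl.addFileNumericData("sales.csv", "price", "demand")
#   expl.getStats("price")
#   expl.plot("demand")
#   expl.save("./wspace.mod")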
class DataSetMetaData:
    """
    data set meta data
    """
    dtypeNum = 1
    dtypeCat = 2
    dtypeBin = 3

    def __init__(self, dtype):
        self.notes = list()
        self.dtype = dtype

    def addNote(self, note):
        """
        add note
        """
        self.notes.append(note)


class DataExplorer:
    """
    various data exploration functions
    """
    def __init__(self, verbose=True):
        """
        initialize

        Parameters
        verbose : True for verbosity
        """
        self.dataSets = dict()
        self.metaData = dict()
        self.pp = pprint.PrettyPrinter(indent=4)
        self.verbose = verbose

    def setVerbose(self, verbose):
        """
        sets verbose

        Parameters
        verbose : True for verbosity
        """
        self.verbose = verbose

    def save(self, filePath):
        """
        save checkpoint

        Parameters
        filePath : path of file where workspace is saved
        """
        self.__printBanner("saving workspace")
        ws = dict()
        ws["data"] = self.dataSets
        ws["metaData"] = self.metaData
        saveObject(ws, filePath)
        self.__printDone()

    def restore(self, filePath):
        """
        restore checkpoint

        Parameters
        filePath : path of file from where workspace is restored
        """
        self.__printBanner("restoring workspace")
        ws = restoreObject(filePath)
        self.dataSets = ws["data"]
        self.metaData = ws["metaData"]
        self.__printDone()

    def queryFileData(self, filePath, *columns):
        """
        query column data type from a data file

        Parameters
        filePath : path of file with data
        columns : indexes followed by column names, or column names
        """
        self.__printBanner("querying column data type from a file")
        lcolumns = list(columns)
        noHeader = type(lcolumns[0]) == int
        if noHeader:
            df = pd.read_csv(filePath, header=None)
        else:
            df = pd.read_csv(filePath, header=0)
        return self.queryDataFrameData(df, *columns)

    def queryDataFrameData(self, df, *columns):
        """
        query column data type from a data frame

        Parameters
        df : data frame with data
        columns : indexes followed by column names, or column names
        """
        self.__printBanner("querying column data type from a data frame")
        columns = list(columns)
        noHeader = type(columns[0]) == int
        dtypes = list()
        if noHeader:
            nCols = int(len(columns) / 2)
            colIndexes = columns[:nCols]
            cnames = columns[nCols:]
            nColsDf = len(df.columns)
            for i in range(nCols):
                ci = colIndexes[i]
                assert ci < nColsDf, "col index {} outside range".format(ci)
                col = df.loc[ : , ci]
                dtypes.append(self.getDataType(col))
        else:
            cnames = columns
            for c in columns:
                col = df[c]
                dtypes.append(self.getDataType(col))

        nt = list(zip(cnames, dtypes))
        result = self.__printResult("columns and data types", nt)
        return result

    def getDataType(self, col):
        """
        get data type

        Parameters
        col : array like containing data
        """
        if isBinary(col):
            dtype = "binary"
        elif isInteger(col):
            dtype = "integer"
        elif isFloat(col):
            dtype = "float"
        elif isCategorical(col):
            dtype = "categorical"
        else:
            dtype = "mixed"
        return dtype

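    # Illustrative example (hypothetical file and column names): for a headerless
    # CSV, pass column indexes followed by the names to assign; for a CSV with a
    # header, pass the column names directly.
    #
    #   expl.queryFileData("data.csv", 0, 2, "age", "income")   # no header
    #   expl.queryFileData("data.csv", "age", "income")         # with header
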
    def addFileNumericData(self, filePath, *columns):
        """
        add numeric columns from a file

        Parameters
        filePath : path of file with data
        columns : indexes followed by column names, or column names
        """
        self.__printBanner("adding numeric columns from a file")
        self.addFileData(filePath, True, *columns)
        self.__printDone()

    def addFileBinaryData(self, filePath, *columns):
        """
        add binary columns from a file

        Parameters
        filePath : path of file with data
        columns : indexes followed by column names, or column names
        """
        self.__printBanner("adding binary columns from a file")
        self.addFileData(filePath, False, *columns)
        self.__printDone()

    def addFileData(self, filePath, numeric, *columns):
        """
        add columns from a file

        Parameters
        filePath : path of file with data
        numeric : True if numeric, False if binary
        columns : indexes followed by column names, or column names
        """
        columns = list(columns)
        noHeader = type(columns[0]) == int
        if noHeader:
            df = pd.read_csv(filePath, header=None)
        else:
            df = pd.read_csv(filePath, header=0)
        self.addDataFrameData(df, numeric, *columns)

    def addDataFrameNumericData(self, df, *columns):
        """
        add numeric columns from a data frame

        Parameters
        df : data frame with data
        columns : indexes followed by column names, or column names
        """
        self.__printBanner("adding numeric columns from a data frame")
        self.addDataFrameData(df, True, *columns)

    def addDataFrameBinaryData(self, df, *columns):
        """
        add binary columns from a data frame

        Parameters
        df : data frame with data
        columns : indexes followed by column names, or column names
        """
        self.__printBanner("adding binary columns from a data frame")
        self.addDataFrameData(df, False, *columns)

    def addDataFrameData(self, df, numeric, *columns):
        """
        add columns from a data frame

        Parameters
        df : data frame with data
        numeric : True if numeric, False if binary
        columns : indexes followed by column names, or column names
        """
        columns = list(columns)
        noHeader = type(columns[0]) == int
        if noHeader:
            nCols = int(len(columns) / 2)
            colIndexes = columns[:nCols]
            nColsDf = len(df.columns)
            for i in range(nCols):
                ci = colIndexes[i]
                assert ci < nColsDf, "col index {} outside range".format(ci)
                col = df.loc[ : , ci]
                if numeric:
                    assert isNumeric(col), "data is not numeric"
                else:
                    assert isBinary(col), "data is not binary"
                col = col.to_numpy()
                cn = columns[i + nCols]
                dtype = DataSetMetaData.dtypeNum if numeric else DataSetMetaData.dtypeBin
                self.__addDataSet(cn, col, dtype)
        else:
            for c in columns:
                col = df[c]
                if numeric:
                    assert isNumeric(col), "data is not numeric"
                else:
                    assert isBinary(col), "data is not binary"
                col = col.to_numpy()
                dtype = DataSetMetaData.dtypeNum if numeric else DataSetMetaData.dtypeBin
                self.__addDataSet(c, col, dtype)

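    # Illustrative example (hypothetical names): the add methods register columns
    # under the given names so later calls can refer to them.
    #
    #   expl.addFileNumericData("data.csv", 0, 1, "age", "income")  # headerless file
    #   expl.addDataFrameBinaryData(df, "clicked")                  # data frame with header
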
    def __addDataSet(self, dsn, data, dtype):
        """
        add data set

        Parameters
        dsn : data set name
        data : numpy array data
        dtype : data type
        """
        self.dataSets[dsn] = data
        self.metaData[dsn] = DataSetMetaData(dtype)

    def addListNumericData(self, ds, name):
        """
        add numeric data from a list

        Parameters
        ds : list with data
        name : name of data set
        """
        self.__printBanner("adding numeric data from a list")
        self.addListData(ds, True, name)
        self.__printDone()

    def addListBinaryData(self, ds, name):
        """
        add binary data from a list

        Parameters
        ds : list with data
        name : name of data set
        """
        self.__printBanner("adding binary data from a list")
        self.addListData(ds, False, name)
        self.__printDone()

    def addListData(self, ds, numeric, name):
        """
        adds list data

        Parameters
        ds : list with data
        numeric : True if numeric, False if binary
        name : name of data set
        """
        assert type(ds) == list, "data not a list"
        if numeric:
            assert isNumeric(ds), "data is not numeric"
        else:
            assert isBinary(ds), "data is not binary"
        dtype = DataSetMetaData.dtypeNum if numeric else DataSetMetaData.dtypeBin
        self.dataSets[name] = np.array(ds)
        self.metaData[name] = DataSetMetaData(dtype)

    def addFileCatData(self, filePath, *columns):
        """
        add categorical columns from a file

        Parameters
        filePath : path of file with data
        columns : indexes followed by column names, or column names
        """
        self.__printBanner("adding categorical columns from a file")
        columns = list(columns)
        noHeader = type(columns[0]) == int
        if noHeader:
            df = pd.read_csv(filePath, header=None)
        else:
            df = pd.read_csv(filePath, header=0)

        self.addDataFrameCatData(df, *columns)
        self.__printDone()

    def addDataFrameCatData(self, df, *columns):
        """
        add categorical columns from a data frame

        Parameters
        df : data frame with data
        columns : indexes followed by column names, or column names
        """
        self.__printBanner("adding categorical columns from a data frame")
        columns = list(columns)
        noHeader = type(columns[0]) == int
        if noHeader:
            nCols = int(len(columns) / 2)
            colIndexes = columns[:nCols]
            nColsDf = len(df.columns)
            for i in range(nCols):
                ci = colIndexes[i]
                assert ci < nColsDf, "col index {} outside range".format(ci)
                col = df.loc[ : , ci]
                assert isCategorical(col), "data is not categorical"
                col = col.tolist()
                cn = columns[i + nCols]
                self.__addDataSet(cn, col, DataSetMetaData.dtypeCat)
        else:
            for c in columns:
                col = df[c].tolist()
                self.__addDataSet(c, col, DataSetMetaData.dtypeCat)

    def addListCatData(self, ds, name):
        """
        add categorical list data

        Parameters
        ds : list with data
        name : name of data set
        """
        self.__printBanner("adding categorical list data")
        assert type(ds) == list, "data not a list"
        assert isCategorical(ds), "data is not categorical"
        self.__addDataSet(name, ds, DataSetMetaData.dtypeCat)
        self.__printDone()

    def remData(self, ds):
        """
        removes data set

        Parameters
        ds : data set name
        """
        self.__printBanner("removing data set", ds)
        assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
        self.dataSets.pop(ds)
        self.metaData.pop(ds)
        names = self.showNames()
        self.__printDone()
        return names

    def addNote(self, ds, note):
        """
        add note for a data set

        Parameters
        ds : data set name
        note : note text
        """
        self.__printBanner("adding note")
        assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
        mdata = self.metaData[ds]
        mdata.addNote(note)
        self.__printDone()

    def getNotes(self, ds):
        """
        get notes for a data set

        Parameters
        ds : data set name
        """
        self.__printBanner("getting notes")
        assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
        mdata = self.metaData[ds]
        dnotes = mdata.notes
        if self.verbose:
            for dn in dnotes:
                print(dn)
        return dnotes

    def getNumericData(self, ds):
        """
        get numeric data

        Parameters
        ds : data set name or list or numpy array with data
        """
        if type(ds) == str:
            assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
            assert self.metaData[ds].dtype == DataSetMetaData.dtypeNum, "data set {} is expected to be numerical type for this operation".format(ds)
            data = self.dataSets[ds]
        elif type(ds) == list:
            assert isNumeric(ds), "data is not numeric"
            data = np.array(ds)
        elif type(ds) == np.ndarray:
            data = ds
        else:
            raise ValueError("invalid type, expecting data set name, list or ndarray")
        return data

    def getCatData(self, ds):
        """
        get categorical data

        Parameters
        ds : data set name or list with data
        """
        if type(ds) == str:
            assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
            assert self.metaData[ds].dtype == DataSetMetaData.dtypeCat, "data set {} is expected to be categorical type for this operation".format(ds)
            data = self.dataSets[ds]
        elif type(ds) == list:
            assert isCategorical(ds), "data is not categorical"
            data = ds
        else:
            raise ValueError("invalid type, expecting data set name or list")
        return data

    def getAnyData(self, ds):
        """
        get any data

        Parameters
        ds : data set name or list with data
        """
        if type(ds) == str:
            assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
            data = self.dataSets[ds]
        elif type(ds) == list:
            data = ds
        else:
            raise ValueError("invalid type, expecting data set name or list")
        return data

    def loadCatFloatDataFrame(self, ds1, ds2):
        """
        loads float and categorical data into a data frame

        Parameters
        ds1 : data set name or list
        ds2 : data set name or list or numpy array
        """
        data1 = self.getCatData(ds1)
        data2 = self.getNumericData(ds2)
        self.ensureSameSize([data1, data2])
        df1 = pd.DataFrame(data=data1)
        df2 = pd.DataFrame(data=data2)
        df = pd.concat([df1, df2], axis=1)
        df.columns = range(df.shape[1])
        return df

    def showNames(self):
        """
        lists data set names
        """
        self.__printBanner("listing data set names")
        names = self.dataSets.keys()
        if self.verbose:
            print("data sets")
            for ds in names:
                print(ds)
        self.__printDone()
        return names

    def plot(self, ds, yscale=None):
        """
        plots data

        Parameters
        ds : data set name or list or numpy array
        yscale : y scale
        """
        self.__printBanner("plotting data", ds)
        data = self.getNumericData(ds)
        drawLine(data, yscale)

    def plotZoomed(self, ds, beg, end, yscale=None):
        """
        plots zoomed data

        Parameters
        ds : data set name or list or numpy array
        beg : begin offset
        end : end offset
        yscale : y scale
        """
        self.__printBanner("plotting data", ds)
        data = self.getNumericData(ds)
        drawLine(data[beg:end], yscale)

    def scatterPlot(self, ds1, ds2):
        """
        scatter plots data

        Parameters
        ds1 : data set name or list or numpy array
        ds2 : data set name or list or numpy array
        """
        self.__printBanner("scatter plotting data", ds1, ds2)
        data1 = self.getNumericData(ds1)
        data2 = self.getNumericData(ds2)
        self.ensureSameSize([data1, data2])
        x = np.arange(1, len(data1) + 1, 1)
        plt.scatter(x, data1, color="red")
        plt.scatter(x, data2, color="blue")
        plt.show()

    def print(self, ds):
        """
        prints data

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("printing data", ds)
        assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
        data = self.dataSets[ds]
        if self.verbose:
            print(formatAny(len(data), "size"))
            print("showing first 50 elements")
            print(data[:50])

    def plotHist(self, ds, cumulative, density, nbins=20):
        """
        plots histogram

        Parameters
        ds : data set name or list or numpy array
        cumulative : True if cumulative
        density : True to normalize for probability density
        nbins : num of bins
        """
        self.__printBanner("plotting histogram", ds)
        data = self.getNumericData(ds)
        plt.hist(data, bins=nbins, cumulative=cumulative, density=density)
        plt.show()

    def isMonotonicallyChanging(self, ds):
        """
        checks if monotonically increasing or decreasing

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("checking monotonic change", ds)
        data = self.getNumericData(ds)
        monoIncreasing = all(list(map(lambda i : data[i] >= data[i-1], range(1, len(data), 1))))
        monoDecreasing = all(list(map(lambda i : data[i] <= data[i-1], range(1, len(data), 1))))
        result = self.__printResult("monoIncreasing", monoIncreasing, "monoDecreasing", monoDecreasing)
        return result

    def getFreqDistr(self, ds, nbins=20):
        """
        get histogram

        Parameters
        ds : data set name or list or numpy array
        nbins : num of bins
        """
        self.__printBanner("getting histogram", ds)
        data = self.getNumericData(ds)
        frequency, lowLimit, binsize, extraPoints = sta.relfreq(data, numbins=nbins)
        result = self.__printResult("frequency", frequency, "lowLimit", lowLimit, "binsize", binsize, "extraPoints", extraPoints)
        return result

    def getCumFreqDistr(self, ds, nbins=20):
        """
        get cumulative freq distribution

        Parameters
        ds : data set name or list or numpy array
        nbins : num of bins
        """
        self.__printBanner("getting cumulative freq distribution", ds)
        data = self.getNumericData(ds)
        cumFrequency, lowLimit, binsize, extraPoints = sta.cumfreq(data, numbins=nbins)
        result = self.__printResult("cumFrequency", cumFrequency, "lowLimit", lowLimit, "binsize", binsize, "extraPoints", extraPoints)
        return result

    def getExtremeValue(self, ds, ensamp, nsamp, polarity, doPlotDistr, nbins=20):
        """
        get extreme values

        Parameters
        ds : data set name or list or numpy array
        ensamp : num of samples for extreme values
        nsamp : num of samples
        polarity : max or min
        doPlotDistr : True to plot distribution
        nbins : num of bins
        """
        self.__printBanner("getting extreme values", ds)
        data = self.getNumericData(ds)
        evalues = list()
        for _ in range(ensamp):
            values = selectRandomSubListFromListWithRepl(data, nsamp)
            if polarity == "max":
                evalues.append(max(values))
            else:
                evalues.append(min(values))
        if doPlotDistr:
            plt.hist(evalues, bins=nbins, cumulative=False, density=True)
            plt.show()
        result = self.__printResult("extremeValues", evalues)
        return result

    def getEntropy(self, ds, nbins=20):
        """
        get entropy

        Parameters
        ds : data set name or list or numpy array
        nbins : num of bins
        """
        self.__printBanner("getting entropy", ds)
        data = self.getNumericData(ds)
        result = self.getFreqDistr(data, nbins)
        entropy = sta.entropy(result["frequency"])
        result = self.__printResult("entropy", entropy)
        return result

    def getRelEntropy(self, ds1, ds2, nbins=20):
        """
        get relative entropy or KL divergence with both data sets numeric

        Parameters
        ds1 : data set name or list or numpy array
        ds2 : data set name or list or numpy array
        nbins : num of bins
        """
        self.__printBanner("getting relative entropy or KL divergence", ds1, ds2)
        data1 = self.getNumericData(ds1)
        data2 = self.getNumericData(ds2)
        result1 = self.getFreqDistr(data1, nbins)
        freq1 = result1["frequency"]
        result2 = self.getFreqDistr(data2, nbins)
        freq2 = result2["frequency"]
        entropy = sta.entropy(freq1, freq2)
        result = self.__printResult("relEntropy", entropy)
        return result

    def getAnyEntropy(self, ds, dt, nbins=20):
        """
        get entropy of any data type, numeric or categorical

        Parameters
        ds : data set name or list or numpy array
        dt : data type num or cat
        nbins : num of bins
        """
        entropy = self.getEntropy(ds, nbins)["entropy"] if dt == "num" else self.getStatsCat(ds)["entropy"]
        result = self.__printResult("entropy", entropy)
        return result

    def getJointEntropy(self, ds1, ds2, nbins=20):
        """
        get joint entropy with both data sets numeric

        Parameters
        ds1 : data set name or list or numpy array
        ds2 : data set name or list or numpy array
        nbins : num of bins
        """
        self.__printBanner("getting joint entropy", ds1, ds2)
        data1 = self.getNumericData(ds1)
        data2 = self.getNumericData(ds2)
        self.ensureSameSize([data1, data2])
        hist, xedges, yedges = np.histogram2d(data1, data2, bins=nbins)
        hist = hist.flatten()
        ssize = len(data1)
        hist = hist / ssize
        entropy = sta.entropy(hist)
        result = self.__printResult("jointEntropy", entropy)
        return result

    def getAllNumMutualInfo(self, ds1, ds2, nbins=20):
        """
        get mutual information for both numeric data

        Parameters
        ds1 : data set name or list or numpy array
        ds2 : data set name or list or numpy array
        nbins : num of bins
        """
        self.__printBanner("getting mutual information", ds1, ds2)
        en1 = self.getEntropy(ds1, nbins)
        en2 = self.getEntropy(ds2, nbins)
        en = self.getJointEntropy(ds1, ds2, nbins)

        mutInfo = en1["entropy"] + en2["entropy"] - en["jointEntropy"]
        result = self.__printResult("mutInfo", mutInfo)
        return result

    def getNumCatMutualInfo(self, nds, cds, nbins=20):
        """
        get mutual information between numeric and categorical data

        Parameters
        nds : numeric data set name or list or numpy array
        cds : categorical data set name or list
        nbins : num of bins
        """
        self.__printBanner("getting mutual information of numerical and categorical data", nds, cds)
        ndata = self.getNumericData(nds)
        cds = self.getCatData(cds)
        nentr = self.getEntropy(nds)["entropy"]

        # conditional entropy
        cdistr = self.getStatsCat(cds)["distr"]
        grdata = self.getGroupByData(nds, cds, True)["groupedData"]
        cnentr = 0
        for gr, data in grdata.items():
            self.addListNumericData(data, "grdata")
            gnentr = self.getEntropy("grdata")["entropy"]
            cnentr += gnentr * cdistr[gr]

        mutInfo = nentr - cnentr
        result = self.__printResult("mutInfo", mutInfo, "entropy", nentr, "condEntropy", cnentr)
        return result

    def getTwoCatMutualInfo(self, cds1, cds2):
        """
        get mutual information between 2 categorical data sets

        Parameters
        cds1 : categorical data set name or list
        cds2 : categorical data set name or list
        """
        self.__printBanner("getting mutual information of two categorical data sets", cds1, cds2)
        cdata1 = self.getCatData(cds1)
        cdata2 = self.getCatData(cds2)
        centr = self.getStatsCat(cds1)["entropy"]

        # conditional entropy
        cdistr = self.getStatsCat(cds2)["distr"]
        grdata = self.getGroupByData(cds1, cds2, True)["groupedData"]
        ccentr = 0
        for gr, data in grdata.items():
            self.addListCatData(data, "grdata")
            gcentr = self.getStatsCat("grdata")["entropy"]
            ccentr += gcentr * cdistr[gr]

        mutInfo = centr - ccentr
        result = self.__printResult("mutInfo", mutInfo, "entropy", centr, "condEntropy", ccentr)
        return result

    def getMutualInfo(self, dst, nbins=20):
        """
        get mutual information between 2 data sets, any combination of numerical and categorical

        Parameters
        dst : data source, data type, data source, data type
        nbins : num of bins
        """
        assertEqual(len(dst), 4, "invalid data source and data type list size")
        dtypes = ["num", "cat"]
        assertInList(dst[1], dtypes, "invalid data type")
        assertInList(dst[3], dtypes, "invalid data type")
        self.__printBanner("getting mutual information of any mix of numerical and categorical data", dst[0], dst[2])

        if dst[1] == "num":
            mutInfo = self.getAllNumMutualInfo(dst[0], dst[2], nbins)["mutInfo"] if dst[3] == "num" \
            else self.getNumCatMutualInfo(dst[0], dst[2], nbins)["mutInfo"]
        else:
            mutInfo = self.getNumCatMutualInfo(dst[2], dst[0], nbins)["mutInfo"] if dst[3] == "num" \
            else self.getTwoCatMutualInfo(dst[2], dst[0])["mutInfo"]

        result = self.__printResult("mutInfo", mutInfo)
        return result

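    # Illustrative example (hypothetical data set names): dst lists each source
    # followed by its type, so numeric and categorical sets mix freely.
    #
    #   dst = ["income", "num", "education", "cat"]
    #   expl.getMutualInfo(dst, nbins=20)
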
    def getCondMutualInfo(self, dst, nbins=20):
        """
        get conditional mutual information between 2 data sets, any combination of numerical and categorical

        Parameters
        dst : data source, data type, data source, data type, data source, data type
        nbins : num of bins
        """
        assertEqual(len(dst), 6, "invalid data source and data type list size")
        dtypes = ["num", "cat"]
        assertInList(dst[1], dtypes, "invalid data type")
        assertInList(dst[3], dtypes, "invalid data type")
        assertInList(dst[5], dtypes, "invalid data type")
        self.__printBanner("getting conditional mutual information of any mix of numerical and categorical data", dst[0], dst[2])

        if dst[5] == "cat":
            cdistr = self.getStatsCat(dst[4])["distr"]
            grdata1 = self.getGroupByData(dst[0], dst[4], True)["groupedData"]
            grdata2 = self.getGroupByData(dst[2], dst[4], True)["groupedData"]
        else:
            gdata = self.getNumericData(dst[4])
            hist = Histogram.createWithNumBins(gdata, nbins)
            cdistr = hist.distr()
            grdata1 = self.getGroupByData(dst[0], dst[4], False)["groupedData"]
            grdata2 = self.getGroupByData(dst[2], dst[4], False)["groupedData"]

        cminfo = 0
        for gr in grdata1.keys():
            data1 = grdata1[gr]
            data2 = grdata2[gr]
            if dst[1] == "num":
                self.addListNumericData(data1, "grdata1")
            else:
                self.addListCatData(data1, "grdata1")

            if dst[3] == "num":
                self.addListNumericData(data2, "grdata2")
            else:
                self.addListCatData(data2, "grdata2")
            gdst = ["grdata1", dst[1], "grdata2", dst[3]]
            minfo = self.getMutualInfo(gdst, nbins)["mutInfo"]
            cminfo += minfo * cdistr[gr]

        result = self.__printResult("condMutInfo", cminfo)
        return result

    def getPercentile(self, ds, value):
        """
        gets percentile of a value

        Parameters
        ds : data set name or list or numpy array
        value : the value
        """
        self.__printBanner("getting percentile", ds)
        data = self.getNumericData(ds)
        percent = sta.percentileofscore(data, value)
        result = self.__printResult("value", value, "percentile", percent)
        return result

    def getValueRangePercentile(self, ds, value1, value2):
        """
        gets percentile difference for a value range

        Parameters
        ds : data set name or list or numpy array
        value1 : first value
        value2 : second value
        """
        self.__printBanner("getting percentile difference for value range", ds)
        if value1 < value2:
            v1 = value1
            v2 = value2
        else:
            v1 = value2
            v2 = value1
        data = self.getNumericData(ds)
        per1 = sta.percentileofscore(data, v1)
        per2 = sta.percentileofscore(data, v2)
        result = self.__printResult("valueFirst", value1, "valueSecond", value2, "percentileDiff", per2 - per1)
        return result

    def getValueAtPercentile(self, ds, percent):
        """
        gets value at percentile

        Parameters
        ds : data set name or list or numpy array
        percent : percentile
        """
        self.__printBanner("getting value at percentile", ds)
        data = self.getNumericData(ds)
        assert isInRange(percent, 0, 100), "percent should be between 0 and 100"
        value = sta.scoreatpercentile(data, percent)
        result = self.__printResult("value", value, "percentile", percent)
        return result

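    # Illustrative example (hypothetical data set name): percentile queries in
    # both directions.
    #
    #   expl.getPercentile("income", 50000)        # percentile of a value
    #   expl.getValueAtPercentile("income", 90)    # value at a percentile
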
    def getLessThanValues(self, ds, cvalue):
        """
        gets values less than given value

        Parameters
        ds : data set name or list or numpy array
        cvalue : condition value
        """
        self.__printBanner("getting values less than", ds)
        fdata = self.__getCondValues(ds, cvalue, "lt")
        result = self.__printResult("count", len(fdata), "lessThanValues", fdata)
        return result

    def getGreaterThanValues(self, ds, cvalue):
        """
        gets values greater than given value

        Parameters
        ds : data set name or list or numpy array
        cvalue : condition value
        """
        self.__printBanner("getting values greater than", ds)
        fdata = self.__getCondValues(ds, cvalue, "gt")
        result = self.__printResult("count", len(fdata), "greaterThanValues", fdata)
        return result

    def __getCondValues(self, ds, cvalue, cond):
        """
        gets conditional values

        Parameters
        ds : data set name or list or numpy array
        cvalue : condition value
        cond : condition
        """
        data = self.getNumericData(ds)
        if cond == "lt":
            ind = np.where(data < cvalue)
        else:
            ind = np.where(data > cvalue)
        fdata = data[ind]
        return fdata

    def getUniqueValueCounts(self, ds, maxCnt=10):
        """
        gets unique values and counts

        Parameters
        ds : data set name or list or numpy array
        maxCnt : max value count pairs to return
        """
        self.__printBanner("getting unique values and counts", ds)
        data = self.getNumericData(ds)
        values, counts = sta.find_repeats(data)
        cardinality = len(values)
        vc = list(zip(values, counts))
        vc.sort(key = lambda v : v[1], reverse = True)
        result = self.__printResult("cardinality", cardinality, "unique values and repeat counts", vc[:maxCnt])
        return result

    def getCatUniqueValueCounts(self, ds, maxCnt=10):
        """
        gets unique categorical values and counts

        Parameters
        ds : data set name or list or numpy array
        maxCnt : max value count pairs to return
        """
        self.__printBanner("getting unique categorical values and counts", ds)
        data = self.getCatData(ds)
        series = pd.Series(data)
        uvalues = series.value_counts()
        values = uvalues.index.tolist()
        counts = uvalues.tolist()
        vc = list(zip(values, counts))
        vc.sort(key = lambda v : v[1], reverse = True)
        result = self.__printResult("cardinality", len(values), "unique values and repeat counts", vc[:maxCnt])
        return result

    def getCatAlphaValueCounts(self, ds):
        """
        gets alphabetic value count

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting alphabetic value counts", ds)
        data = self.getCatData(ds)
        series = pd.Series(data)
        flags = series.str.isalpha().tolist()
        count = sum(flags)
        result = self.__printResult("alphabeticValueCount", count)
        return result

    def getCatNumValueCounts(self, ds):
        """
        gets numeric value count

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting numeric value counts", ds)
        data = self.getCatData(ds)
        series = pd.Series(data)
        flags = series.str.isnumeric().tolist()
        count = sum(flags)
        result = self.__printResult("numericValueCount", count)
        return result

    def getCatAlphaNumValueCounts(self, ds):
        """
        gets alphanumeric value count

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting alphanumeric value counts", ds)
        data = self.getCatData(ds)
        series = pd.Series(data)
        flags = series.str.isalnum().tolist()
        count = sum(flags)
        result = self.__printResult("alphaNumericValueCount", count)
        return result

    def getCatAllCharCounts(self, ds):
        """
        gets alphabetic, numeric and special char count list

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting alphabetic, numeric and special char counts", ds)
        data = self.getCatData(ds)
        counts = list()
        for d in data:
            r = getAlphaNumCharCount(d)
            counts.append(r)
        result = self.__printResult("allTypeCharCounts", counts)
        return result

    def getCatAlphaCharCounts(self, ds):
        """
        gets alphabetic char count list

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting alphabetic char counts", ds)
        data = self.getCatData(ds)
        counts = self.getCatAllCharCounts(ds)["allTypeCharCounts"]
        counts = list(map(lambda r : r[0], counts))
        result = self.__printResult("alphaCharCounts", counts)
        return result

    def getCatNumCharCounts(self, ds):
        """
        gets numeric char count list

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting numeric char counts", ds)
        data = self.getCatData(ds)
        counts = self.getCatAllCharCounts(ds)["allTypeCharCounts"]
        counts = list(map(lambda r : r[1], counts))
        result = self.__printResult("numCharCounts", counts)
        return result

    def getCatSpecialCharCounts(self, ds):
        """
        gets special char count list

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting special char counts", ds)
        counts = self.getCatAllCharCounts(ds)["allTypeCharCounts"]
        counts = list(map(lambda r : r[2], counts))
        result = self.__printResult("specialCharCounts", counts)
        return result

    def getCatAlphaCharCountStats(self, ds):
        """
        gets alphabetic char count stats

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting alphabetic char count stats", ds)
        counts = self.getCatAlphaCharCounts(ds)["alphaCharCounts"]
        nz = counts.count(0)
        st = self.__getBasicStats(np.array(counts))
        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3], "zeroCount", nz)
        return result

    def getCatNumCharCountStats(self, ds):
        """
        gets numeric char count stats

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting numeric char count stats", ds)
        counts = self.getCatNumCharCounts(ds)["numCharCounts"]
        nz = counts.count(0)
        st = self.__getBasicStats(np.array(counts))
        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3], "zeroCount", nz)
        return result

    def getCatSpecialCharCountStats(self, ds):
        """
        gets special char count stats

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting special char count stats", ds)
        counts = self.getCatSpecialCharCounts(ds)["specialCharCounts"]
        nz = counts.count(0)
        st = self.__getBasicStats(np.array(counts))
        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3], "zeroCount", nz)
        return result

    def getCatFldLenStats(self, ds):
        """
        gets field length stats

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting field length stats", ds)
        data = self.getCatData(ds)
        le = list(map(lambda d : len(d), data))
        st = self.__getBasicStats(np.array(le))
        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3])
        return result

    def getCatCharCountStats(self, ds, ch):
        """
        gets occurrence count stats for a specified char

        Parameters
        ds : data set name or list or numpy array
        ch : character
        """
        self.__printBanner("getting char occurrence count stats", ds)
        data = self.getCatData(ds)
        counts = list(map(lambda d : d.count(ch), data))
        nz = counts.count(0)
        st = self.__getBasicStats(np.array(counts))
        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3], "zeroCount", nz)
        return result

    def getStats(self, ds, nextreme=5):
        """
        gets summary statistics

        Parameters
        ds : data set name or list or numpy array
        nextreme : num of extreme values
        """
        self.__printBanner("getting summary statistics", ds)
        data = self.getNumericData(ds)
        stat = dict()
        stat["length"] = len(data)
        stat["min"] = data.min()
        stat["max"] = data.max()
        series = pd.Series(data)
        stat["n smallest"] = series.nsmallest(n=nextreme).tolist()
        stat["n largest"] = series.nlargest(n=nextreme).tolist()
        stat["mean"] = data.mean()
        stat["median"] = np.median(data)
        mode, modeCnt = sta.mode(data)
        stat["mode"] = mode[0]
        stat["mode count"] = modeCnt[0]
        stat["std"] = np.std(data)
        stat["skew"] = sta.skew(data)
        stat["kurtosis"] = sta.kurtosis(data)
        stat["mad"] = sta.median_absolute_deviation(data)
        self.pp.pprint(stat)
        return stat

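    # Illustrative example (hypothetical data set name):
    #
    #   st = expl.getStats("price", nextreme=3)
    #   print(st["mean"], st["std"])
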
    def getStatsCat(self, ds):
        """
        gets summary statistics for categorical data

        Parameters
        ds : data set name or list or numpy array
        """
        self.__printBanner("getting summary statistics for categorical data", ds)
        data = self.getCatData(ds)
        ch = CatHistogram()
        for d in data:
            ch.add(d)
        mode = ch.getMode()
        entr = ch.getEntropy()
        uvalues = ch.getUniqueValues()
        distr = ch.getDistr()
        result = self.__printResult("entropy", entr, "mode", mode, "uniqueValues", uvalues, "distr", distr)
        return result

    def getGroupByData(self, ds, gds, gdtypeCat, numBins=20):
        """
        group by

        Parameters
        ds : data set name or list or numpy array
        gds : group by data set name or list or numpy array
        gdtypeCat : True if group by data is categorical
        numBins : num of bins for numeric group by data
        """
        self.__printBanner("getting group by data", ds)
        data = self.getAnyData(ds)
        if gdtypeCat:
            gdata = self.getCatData(gds)
        else:
            gdata = self.getNumericData(gds)
            hist = Histogram.createWithNumBins(gdata, numBins)
            gdata = list(map(lambda d : hist.bin(d), gdata))

        self.ensureSameSize([data, gdata])
        groups = dict()
        for g, d in zip(gdata, data):
            appendKeyedList(groups, g, d)

        ve = self.verbose
        self.verbose = False
        result = self.__printResult("groupedData", groups)
        self.verbose = ve
        return result

    def getDifference(self, ds, order, doPlot=False):
        """
        gets difference of given order

        Parameters
        ds : data set name or list or numpy array
        order : order of difference
        doPlot : True for plot
        """
        self.__printBanner("getting difference of given order", ds)
        data = self.getNumericData(ds)
        diff = difference(data, order)
        if doPlot:
            drawLine(diff)
        return diff

    def getTrend(self, ds, doPlot=False):
        """
        get trend

        Parameters
        ds : data set name or list or numpy array
        doPlot : True if plotting needed
        """
        self.__printBanner("getting trend")
        data = self.getNumericData(ds)
        sz = len(data)
        X = list(range(0, sz))
        X = np.reshape(X, (sz, 1))
        model = LinearRegression()
        model.fit(X, data)
        trend = model.predict(X)
        sc = model.score(X, data)
        coef = model.coef_
        intc = model.intercept_
        result = self.__printResult("coeff", coef, "intercept", intc, "r squared", sc, "trend", trend)

        if doPlot:
            plt.plot(data)
            plt.plot(trend)
            plt.show()
        return result

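    # Illustrative example (hypothetical data set name): the fitted trend returned
    # by getTrend can be passed to deTrend, defined below.
    #
    #   res = expl.getTrend("sales", doPlot=True)
    #   detrended = expl.deTrend("sales", res["trend"])
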
    def getDiffSdNoisiness(self, ds):
        """
        get noisiness based on std dev of first order difference

        Parameters
        ds : data set name or list or numpy array
        """
        diff = self.getDifference(ds, 1)
        noise = np.std(np.array(diff))
        result = self.__printResult("noisiness", noise)
        return result

    def getMaRmseNoisiness(self, ds, wsize=5):
        """
        gets noisiness based on RMSE with moving average

        Parameters
        ds : data set name or list or numpy array
        wsize : window size
        """
        assert wsize % 2 == 1, "window size must be odd"
        data = self.getNumericData(ds)
        wind = data[:wsize]
        wstat = SlidingWindowStat.initialize(wind.tolist())

        whsize = int(wsize / 2)
        beg = whsize
        end = len(data) - whsize - 1
        sumSq = 0.0
        mean = wstat.getStat()[0]
        diff = data[beg] - mean
        sumSq += diff * diff
        for i in range(beg + 1, end, 1):
            mean = wstat.addGetStat(data[i + whsize])[0]
            diff = data[i] - mean
            sumSq += (diff * diff)

        noise = sqrt(sumSq / (len(data) - 2 * whsize))
        result = self.__printResult("noisiness", noise)
        return result

    def deTrend(self, ds, trend, doPlot=False):
        """
        de trend

        Parameters
        ds : data set name or list or numpy array
        trend : trend data
        doPlot : True if plotting needed
        """
        self.__printBanner("doing de trend", ds)
        data = self.getNumericData(ds)
        sz = len(data)
        detrended = list(map(lambda i : data[i] - trend[i], range(sz)))
        if doPlot:
            drawLine(detrended)
        return detrended

    def getTimeSeriesComponents(self, ds, model, freq, summaryOnly, doPlot=False):
        """
        extracts trend, cycle and residue components of time series

        Parameters
        ds : data set name or list or numpy array
        model : model type
        freq : seasonality period
        summaryOnly : True if only summary needed in output
        doPlot : True if plotting needed
        """
        self.__printBanner("extracting trend, cycle and residue components of time series", ds)
        assert model == "additive" or model == "multiplicative", "model must be additive or multiplicative"
        data = self.getNumericData(ds)
        res = seasonal_decompose(data, model=model, period=freq)
        if doPlot:
            res.plot()
            plt.show()

        # summary of components
        trend = np.array(removeNan(res.trend))
        trendMean = trend.mean()
        trendSlope = (trend[-1] - trend[0]) / (len(trend) - 1)
        seasonal = np.array(removeNan(res.seasonal))
        seasonalAmp = (seasonal.max() - seasonal.min()) / 2
        resid = np.array(removeNan(res.resid))
        residueMean = resid.mean()
        residueStdDev = np.std(resid)

        if summaryOnly:
            result = self.__printResult("trendMean", trendMean, "trendSlope", trendSlope, "seasonalAmp", seasonalAmp,
            "residueMean", residueMean, "residueStdDev", residueStdDev)
        else:
            result = self.__printResult("trendMean", trendMean, "trendSlope", trendSlope, "seasonalAmp", seasonalAmp,
            "residueMean", residueMean, "residueStdDev", residueStdDev, "trend", res.trend, "seasonal", res.seasonal,
            "residual", res.resid)
        return result

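    # Illustrative example (hypothetical data set name): monthly data with a
    # yearly cycle would use a period of 12.
    #
    #   expl.getTimeSeriesComponents("sales", "additive", 12, True, doPlot=False)
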
    def getGausianMixture(self, ncomp, cvType, ninit, *dsl):
        """
        finds gaussian mixture parameters

        Parameters
        ncomp : num of gaussian components
        cvType : covariance type
        ninit : num of initializations
        dsl : list of data set name or list or numpy array
        """
        self.__printBanner("getting gaussian mixture parameters", *dsl)
        assertInList(cvType, ["full", "tied", "diag", "spherical"], "invalid covariance type")
        dmat = self.__stackData(*dsl)

        gm = GaussianMixture(n_components=ncomp, covariance_type=cvType, n_init=ninit)
        gm.fit(dmat)
        weights = gm.weights_
        means = gm.means_
        covars = gm.covariances_
        converged = gm.converged_
        niter = gm.n_iter_
        aic = gm.aic(dmat)
        result = self.__printResult("weights", weights, "mean", means, "covariance", covars, "converged", converged, "num iterations", niter, "aic", aic)
        return result

    def getKmeansCluster(self, nclust, ninit, *dsl):
        """
        gets cluster parameters

        Parameters
        nclust : num of clusters
        ninit : num of initializations
        dsl : list of data set name or list or numpy array
        """
        self.__printBanner("getting kmeans cluster parameters", *dsl)
        dmat = self.__stackData(*dsl)
        nsamp = dmat.shape[0]

        km = KMeans(n_clusters=nclust, n_init=ninit)
        km.fit(dmat)
        centers = km.cluster_centers_
        avdist = sqrt(km.inertia_ / nsamp)
        niter = km.n_iter_
        score = km.score(dmat)
        result = self.__printResult("centers", centers, "average distance", avdist, "num iterations", niter, "score", score)
        return result

    def getPrincComp(self, ncomp, *dsl):
        """
        finds principal component parameters

        Parameters
        ncomp : num of principal components
        dsl : list of data set name or list or numpy array
        """
        self.__printBanner("getting principal component parameters", *dsl)
        dmat = self.__stackData(*dsl)
        nfeat = dmat.shape[1]
        assertGreater(nfeat, 1, "requires multiple features")
        assertLesserEqual(ncomp, nfeat, "num of components greater than num of features")

        pca = PCA(n_components=ncomp)
        pca.fit(dmat)
        comps = pca.components_
        var = pca.explained_variance_
        varr = pca.explained_variance_ratio_
        svalues = pca.singular_values_
        result = self.__printResult("components", comps, "variance", var, "variance ratio", varr, "singular values", svalues)
        return result

    def getOutliersWithIsoForest(self, contamination, *dsl):
        """
        finds outliers using isolation forest

        Parameters
        contamination : proportion of outliers in the data set
        dsl : list of data set name or list or numpy array
        """
        self.__printBanner("getting outliers using isolation forest", *dsl)
        assert contamination >= 0 and contamination <= 0.5, "contamination outside valid range"
        dmat = self.__stackData(*dsl)

        # the "behaviour" argument of older scikit-learn versions is omitted here,
        # since it was removed in later releases
        isf = IsolationForest(contamination=contamination)
        ypred = isf.fit_predict(dmat)
        mask = ypred == -1
        doul = dmat[mask, :]
        mask = ypred != -1
        dwoul = dmat[mask, :]
        result = self.__printResult("numOutliers", doul.shape[0], "outliers", doul, "dataWithoutOutliers", dwoul)
        return result

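    # Illustrative example (hypothetical data set names): multiple data sets are
    # stacked as features, with an assumed 5 percent contamination.
    #
    #   res = expl.getOutliersWithIsoForest(0.05, "price", "demand")
    #   print(res["numOutliers"])
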
+     def getOutliersWithLocalFactor(self, contamination, *dsl):
+         """
+         gets outliers using local outlier factor
+
+         Parameters
+             contamination : proportion of outliers in the data set
+             dsl : list of data set name or list or numpy array
+         """
+         self.__printBanner("getting outliers using local outlier factor", *dsl)
+         assert contamination >= 0 and contamination <= 0.5, "contamination outside valid range"
+         dmat = self.__stackData(*dsl)
+
+         lof = LocalOutlierFactor(contamination=contamination)
+         ypred = lof.fit_predict(dmat)
+         mask = ypred == -1
+         doul = dmat[mask, :]
+         mask = ypred != -1
+         dwoul = dmat[mask, :]
+         result = self.__printResult("numOutliers", doul.shape[0], "outliers", doul, "dataWithoutOutliers", dwoul)
+         return result
+
+     def getOutliersWithSupVecMach(self, nu, *dsl):
+         """
+         gets outliers using one class svm
+
+         Parameters
+             nu : upper bound on the fraction of training errors and a lower bound of the fraction of support vectors
+             dsl : list of data set name or list or numpy array
+         """
+         self.__printBanner("getting outliers using one class svm", *dsl)
+         assert nu >= 0 and nu <= 0.5, "error upper bound outside valid range"
+         dmat = self.__stackData(*dsl)
+
+         svm = OneClassSVM(nu=nu)
+         ypred = svm.fit_predict(dmat)
+         mask = ypred == -1
+         doul = dmat[mask, :]
+         mask = ypred != -1
+         dwoul = dmat[mask, :]
+         result = self.__printResult("numOutliers", doul.shape[0], "outliers", doul, "dataWithoutOutliers", dwoul)
+         return result
+
+     def getOutliersWithCovarDeterminant(self, contamination, *dsl):
+         """
+         gets outliers using covariance determinant
+
+         Parameters
+             contamination : proportion of outliers in the data set
+             dsl : list of data set name or list or numpy array
+         """
+         self.__printBanner("getting outliers using covariance determinant", *dsl)
+         assert contamination >= 0 and contamination <= 0.5, "contamination outside valid range"
+         dmat = self.__stackData(*dsl)
+
+         env = EllipticEnvelope(contamination=contamination)
+         ypred = env.fit_predict(dmat)
+         mask = ypred == -1
+         doul = dmat[mask, :]
+         mask = ypred != -1
+         dwoul = dmat[mask, :]
+         result = self.__printResult("numOutliers", doul.shape[0], "outliers", doul, "dataWithoutOutliers", dwoul)
+         return result
+
+     def getOutliersWithZscore(self, ds, zthreshold, stats=None):
+         """
+         gets outliers using zscore
+
+         Parameters
+             ds : data set name or list or numpy array
+             zthreshold : z score threshold
+             stats : tuple containing mean and std dev
+         """
+         self.__printBanner("getting outliers using zscore", ds)
+         data = self.getNumericData(ds)
+         if stats is None:
+             mean = data.mean()
+             sd = np.std(data)
+         else:
+             mean = stats[0]
+             sd = stats[1]
+
+         zs = list(map(lambda d : abs((d - mean) / sd), data))
+         outliers = list(filter(lambda r : r[1] > zthreshold, enumerate(zs)))
+         result = self.__printResult("outliers", outliers)
+         return result
+
+     def getOutliersWithRobustZscore(self, ds, zthreshold, stats=None):
+         """
+         gets outliers using robust zscore
+
+         Parameters
+             ds : data set name or list or numpy array
+             zthreshold : z score threshold
+             stats : tuple containing median and median absolute deviation
+         """
+         self.__printBanner("getting outliers using robust zscore", ds)
+         data = self.getNumericData(ds)
+         if stats is None:
+             med = np.median(data)
+             dev = np.array(list(map(lambda d : abs(d - med), data)))
+             #1.4826 is the standard consistency constant for normally distributed data
+             mad = 1.4826 * np.median(dev)
+         else:
+             med = stats[0]
+             mad = stats[1]
+
+         rzs = list(map(lambda d : abs((d - med) / mad), data))
+         outliers = list(filter(lambda r : r[1] > zthreshold, enumerate(rzs)))
+         result = self.__printResult("outliers", outliers)
+         return result
+
+
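A small numeric sketch of the robust z-score above; the median absolute deviation barely moves when one gross outlier is present, so that point gets a very large score.

import numpy as np

data = np.array([10.1, 9.8, 10.3, 9.9, 10.0, 25.0])
med = np.median(data)
mad = 1.4826 * np.median(np.abs(data - med))
rzs = np.abs(data - med) / mad
print([(i, round(z, 1)) for i, z in enumerate(rzs) if z > 3.0])  # flags only index 5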
+     def getSubsequenceOutliersWithDissimilarity(self, subSeqSize, ds):
+         """
+         gets subsequence outlier with subsequence pairwise dissimilarity
+
+         Parameters
+             subSeqSize : sub sequence size
+             ds : data set name or list or numpy array
+         """
+         self.__printBanner("doing sub sequence anomaly detection with dissimilarity", ds)
+         data = self.getNumericData(ds)
+         sz = len(data)
+         dist = dict()
+         minDist = dict()
+         for i in range(sz - subSeqSize):
+             #first window
+             w1 = data[i : i + subSeqSize]
+             dmin = None
+             for j in range(sz - subSeqSize):
+                 #second window not overlapping with the first
+                 if j + subSeqSize <= i or j >= i + subSeqSize:
+                     w2 = data[j : j + subSeqSize]
+                     k = (j, i)
+                     if k in dist:
+                         d = dist[k]
+                     else:
+                         d = euclideanDistance(w1, w2)
+                         k = (i, j)
+                         dist[k] = d
+                     if dmin is None:
+                         dmin = d
+                     else:
+                         dmin = d if d < dmin else dmin
+             minDist[i] = dmin
+
+         #find max of min
+         dmax = None
+         offset = None
+         for k in minDist.keys():
+             d = minDist[k]
+             if dmax is None:
+                 dmax = d
+                 offset = k
+             else:
+                 if d > dmax:
+                     dmax = d
+                     offset = k
+         result = self.__printResult("subSeqOffset", offset, "outlierScore", dmax)
+         return result
+
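A compact numpy sketch of the same max-of-min-distances discord idea on a synthetic series with an injected anomaly; the reported offset should land near index 50.

import numpy as np

rng = np.random.default_rng(3)
data = np.sin(np.linspace(0, 20, 200)) + 0.05 * rng.normal(size=200)
data[50:58] += 2.0  # injected anomaly
w = 8
wins = [data[i:i + w] for i in range(len(data) - w)]
minDist = [min(np.linalg.norm(wins[i] - wins[j]) for j in range(len(wins))
           if j + w <= i or j >= i + w) for i in range(len(wins))]
print(int(np.argmax(minDist)))  # offset of the most dissimilar window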
+     def getNullCount(self, ds):
+         """
+         get count of null fields
+
+         Parameters
+             ds : data set name or list or numpy array with data
+         """
+         self.__printBanner("getting null value count", ds)
+         if type(ds) == str:
+             assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
+             data = self.dataSets[ds]
+             ser = pd.Series(data)
+         elif type(ds) == list or type(ds) == np.ndarray:
+             ser = pd.Series(ds)
+             data = ds
+         else:
+             raise ValueError("invalid data type")
+         nv = ser.isnull().tolist()
+         nullCount = nv.count(True)
+         nullFraction = nullCount / len(data)
+         result = self.__printResult("nullFraction", nullFraction, "nullCount", nullCount)
+         return result
+
+
+     def fitLinearReg(self, dsx, ds, doPlot=False):
+         """
+         fits linear regression
+
+         Parameters
+             dsx : x data set name or None
+             ds : data set name or list or numpy array
+             doPlot : true if plotting needed
+         """
+         self.__printBanner("fitting linear regression", ds)
+         data = self.getNumericData(ds)
+         if dsx is None:
+             x = np.arange(len(data))
+         else:
+             x = self.getNumericData(dsx)
+         slope, intercept, rvalue, pvalue, stderr = sta.linregress(x, data)
+         result = self.__printResult("slope", slope, "intercept", intercept, "rvalue", rvalue, "pvalue", pvalue, "stderr", stderr)
+         if doPlot:
+             self.plotRegFit(x, data, slope, intercept)
+         return result
+
+     def fitSiegelRobustLinearReg(self, ds, doPlot=False):
+         """
+         Siegel robust linear regression fit based on median
+
+         Parameters
+             ds : data set name or list or numpy array
+             doPlot : true if plotting needed
+         """
+         self.__printBanner("fitting Siegel robust linear regression based on median", ds)
+         data = self.getNumericData(ds)
+         slope, intercept = sta.siegelslopes(data)
+         result = self.__printResult("slope", slope, "intercept", intercept)
+         if doPlot:
+             x = np.arange(len(data))
+             self.plotRegFit(x, data, slope, intercept)
+         return result
+
+     def fitTheilSenRobustLinearReg(self, ds, doPlot=False):
+         """
+         Theil-Sen robust linear regression fit based on median
+
+         Parameters
+             ds : data set name or list or numpy array
+             doPlot : true if plotting needed
+         """
+         self.__printBanner("fitting Theil-Sen robust linear regression based on median", ds)
+         data = self.getNumericData(ds)
+         slope, intercept, loSlope, upSlope = sta.theilslopes(data)
+         result = self.__printResult("slope", slope, "intercept", intercept, "lower slope", loSlope, "upper slope", upSlope)
+         if doPlot:
+             x = np.arange(len(data))
+             self.plotRegFit(x, data, slope, intercept)
+         return result
+
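A sketch contrasting the three fits above on data with a contaminated tail; the median based estimators should stay near the true slope of 2 while ordinary least squares is pulled away.

import numpy as np
import scipy.stats as sta

rng = np.random.default_rng(5)
x = np.arange(50, dtype=float)
y = 2.0 * x + 1.0 + rng.normal(0, 0.5, size=50)
y[45:] += 40.0  # contaminate the tail
print(sta.linregress(x, y).slope)  # biased upward by the outliers
print(sta.siegelslopes(y, x)[0])   # robust, near 2
print(sta.theilslopes(y, x)[0])    # robust, near 2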
+     def plotRegFit(self, x, y, slope, intercept):
+         """
+         plots linear regression fit line
+
+         Parameters
+             x : x values
+             y : y values
+             slope : slope
+             intercept : intercept
+         """
+         self.__printBanner("plotting linear regression fit line")
+         fig = plt.figure()
+         ax = fig.add_subplot(111)
+         ax.plot(x, y, "b.")
+         ax.plot(x, intercept + slope * x, "r-")
+         plt.show()
+
+     def getRegFit(self, xvalues, yvalues, slope, intercept):
+         """
+         gets fitted line and residue
+
+         Parameters
+             xvalues : x values
+             yvalues : y values
+             slope : regression slope
+             intercept : regression intercept
+         """
+         yfit = list()
+         residue = list()
+         for x, y in zip(xvalues, yvalues):
+             yf = x * slope + intercept
+             yfit.append(yf)
+             r = y - yf
+             residue.append(r)
+         result = self.__printResult("fitted line", yfit, "residue", residue)
+         return result
+
+     def getInfluentialPoints(self, dsx, dsy):
+         """
+         gets influential points in regression model with Cook's distance
+
+         Parameters
+             dsx : data set name or list or numpy array for x
+             dsy : data set name or list or numpy array for y
+         """
+         self.__printBanner("finding influential points for linear regression", dsx, dsy)
+         y = self.getNumericData(dsy)
+         x = np.arange(len(y)) if dsx is None else self.getNumericData(dsx)
+         model = sm.OLS(y, x).fit()
+         np.set_printoptions(suppress=True)
+         influence = model.get_influence()
+         cooks = influence.cooks_distance
+         result = self.__printResult("Cook distance", cooks)
+         return result
+
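A hedged follow-up sketch: a common rule of thumb, not prescribed by this module, flags observations with Cook's distance above 4/n as influential.

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(7)
x = np.arange(30, dtype=float)
y = 3.0 * x + rng.normal(0, 1.0, size=30)
y[10] += 25.0  # one planted influential point
cooks = sm.OLS(y, sm.add_constant(x)).fit().get_influence().cooks_distance[0]
print(np.where(cooks > 4.0 / len(x))[0])  # expected to include index 10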
+     def getCovar(self, *dsl):
+         """
+         gets covariance
+
+         Parameters
+             dsl : list of data set name or list or numpy array
+         """
+         self.__printBanner("getting covariance", *dsl)
+         data = list(map(lambda ds : self.getNumericData(ds), dsl))
+         self.ensureSameSize(data)
+         data = np.vstack(data)
+         cv = np.cov(data)
+         print(cv)
+         return cv
+
+     def getPearsonCorr(self, ds1, ds2, sigLev=.05):
+         """
+         gets pearson correlation coefficient
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("getting pearson correlation coefficient", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         self.ensureSameSize([data1, data2])
+         stat, pvalue = sta.pearsonr(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
+         return result
+
+
+     def getSpearmanRankCorr(self, ds1, ds2, sigLev=.05):
+         """
+         gets spearman correlation coefficient
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("getting spearman correlation coefficient", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         self.ensureSameSize([data1, data2])
+         stat, pvalue = sta.spearmanr(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
+         return result
+
+     def getKendalRankCorr(self, ds1, ds2, sigLev=.05):
+         """
+         Kendall's tau, a correlation measure for ordinal data
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("getting Kendall's tau, a correlation measure for ordinal data", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         self.ensureSameSize([data1, data2])
+         stat, pvalue = sta.kendalltau(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
+         return result
+
+     def getPointBiserialCorr(self, ds1, ds2, sigLev=.05):
+         """
+         point biserial correlation between binary and numeric
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("getting point biserial correlation between binary and numeric", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         assert isBinary(data1), "first data set is not binary"
+         self.ensureSameSize([data1, data2])
+         stat, pvalue = sta.pointbiserialr(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
+         return result
+
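A sketch of why the rank based measures above differ from Pearson: on monotonic but nonlinear data the rank correlation is exactly 1 while the linear correlation is not.

import numpy as np
import scipy.stats as sta

x = np.linspace(1.0, 5.0, 100)
y = np.exp(x)  # monotonic but nonlinear
print(sta.pearsonr(x, y)[0])   # noticeably below 1
print(sta.spearmanr(x, y)[0])  # exactly 1.0, the ranks agree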
+     def getConTab(self, ds1, ds2):
+         """
+         gets contingency table for categorical data pair
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+         """
+         self.__printBanner("getting contingency table for categorical data", ds1, ds2)
+         data1 = self.getCatData(ds1)
+         data2 = self.getCatData(ds2)
+         self.ensureSameSize([data1, data2])
+         crosstab = pd.crosstab(pd.Series(data1), pd.Series(data2), margins=False)
+         ctab = crosstab.values
+         print("contingency table")
+         print(ctab)
+         return ctab
+
+     def getChiSqCorr(self, ds1, ds2, sigLev=.05):
+         """
+         chi square correlation for categorical data pair
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("getting chi square correlation for two categorical", ds1, ds2)
+         ctab = self.getConTab(ds1, ds2)
+         stat, pvalue, dof, expctd = sta.chi2_contingency(ctab)
+         result = self.__printResult("stat", stat, "pvalue", pvalue, "dof", dof, "expected", expctd)
+         self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
+         return result
+
+     def getSizeCorrectChiSqCorr(self, ds1, ds2, chisq):
+         """
+         Cramer's V, size corrected chi square correlation for categorical data pair
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             chisq : chisq stat
+         """
+         self.__printBanner("getting size corrected chi square correlation for two categorical", ds1, ds2)
+         c1 = self.getCatUniqueValueCounts(ds1)["cardinality"]
+         c2 = self.getCatUniqueValueCounts(ds2)["cardinality"]
+         c = min(c1, c2)
+         assertGreater(c, 1, "min cardinality should be greater than 1")
+         l = len(self.getCatData(ds1))
+         t = l * (c - 1)
+         stat = math.sqrt(chisq / t)
+         result = self.__printResult("stat", stat)
+         return result
+
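A self-contained sketch of the Cramer's V computation above, starting from a contingency table; the table values are made up for illustration.

import math
import numpy as np
import scipy.stats as sta

ctab = np.array([[30, 10], [10, 30]])
chisq = sta.chi2_contingency(ctab)[0]
n = ctab.sum()
c = min(ctab.shape)  # smaller cardinality
print(math.sqrt(chisq / (n * (c - 1))))  # Cramer's V, in [0, 1]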
+     def getAnovaCorr(self, ds1, ds2, grByCol, sigLev=.05):
+         """
+         anova correlation between numerical and categorical
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             grByCol : group by column
+             sigLev : statistical significance level
+         """
+         self.__printBanner("anova correlation between numerical and categorical", ds1, ds2)
+         df = self.loadCatFloatDataFrame(ds1, ds2) if grByCol == 0 else self.loadCatFloatDataFrame(ds2, ds1)
+         grByCol = 0
+         dCol = 1
+         grouped = df.groupby([grByCol])
+         dlist = list(map(lambda v : v[1].loc[:, dCol].values, grouped))
+         stat, pvalue = sta.f_oneway(*dlist)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
+         return result
+
+
+     def plotAutoCorr(self, ds, lags, alpha, diffOrder=0):
+         """
+         plots auto correlation
+
+         Parameters
+             ds : data set name or list or numpy array
+             lags : num of lags
+             alpha : confidence level
+             diffOrder : order of differencing applied before plotting
+         """
+         self.__printBanner("plotting auto correlation", ds)
+         data = self.getNumericData(ds)
+         ddata = difference(data, diffOrder) if diffOrder > 0 else data
+         tsaplots.plot_acf(ddata, lags=lags, alpha=alpha)
+         plt.show()
+
+     def getAutoCorr(self, ds, lags, alpha=.05):
+         """
+         gets auto correlation
+
+         Parameters
+             ds : data set name or list or numpy array
+             lags : num of lags
+             alpha : confidence level
+         """
+         self.__printBanner("getting auto correlation", ds)
+         data = self.getNumericData(ds)
+         autoCorr, confIntv = stt.acf(data, nlags=lags, fft=False, alpha=alpha)
+         result = self.__printResult("autoCorr", autoCorr, "confIntv", confIntv)
+         return result
+
+
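A sketch of the underlying statsmodels call on a synthetic AR(1) series, whose autocorrelation should decay roughly as 0.8 raised to the lag.

import numpy as np
import statsmodels.tsa.stattools as stt

rng = np.random.default_rng(11)
x = np.zeros(500)
for i in range(1, 500):
    x[i] = 0.8 * x[i - 1] + rng.normal()  # AR(1) with coefficient 0.8
ac, conf = stt.acf(x, nlags=5, fft=False, alpha=0.05)
print(ac)  # roughly 1, 0.8, 0.64, ...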
+     def plotParAcf(self, ds, lags, alpha):
+         """
+         plots partial auto correlation
+
+         Parameters
+             ds : data set name or list or numpy array
+             lags : num of lags
+             alpha : confidence level
+         """
+         self.__printBanner("plotting partial auto correlation", ds)
+         data = self.getNumericData(ds)
+         tsaplots.plot_pacf(data, lags=lags, alpha=alpha)
+         plt.show()
+
+     def getParAutoCorr(self, ds, lags, alpha=.05):
+         """
+         gets partial auto correlation
+
+         Parameters
+             ds : data set name or list or numpy array
+             lags : num of lags
+             alpha : confidence level
+         """
+         self.__printBanner("getting partial auto correlation", ds)
+         data = self.getNumericData(ds)
+         partAutoCorr, confIntv = stt.pacf(data, nlags=lags, alpha=alpha)
+         result = self.__printResult("partAutoCorr", partAutoCorr, "confIntv", confIntv)
+         return result
+
+     def getHurstExp(self, ds, kind, doPlot=True):
+         """
+         gets Hurst exponent of time series
+
+         Parameters
+             ds : data set name or list or numpy array
+             kind : kind of data change, random_walk, price
+             doPlot : True for plot
+         """
+         self.__printBanner("getting Hurst exponent", ds)
+         data = self.getNumericData(ds)
+         h, c, odata = hurst.compute_Hc(data, kind=kind, simplified=False)
+         if doPlot:
+             f, ax = plt.subplots()
+             ax.plot(odata[0], c * odata[0] ** h, color="deepskyblue")
+             ax.scatter(odata[0], odata[1], color="purple")
+             ax.set_xscale("log")
+             ax.set_yscale("log")
+             ax.set_xlabel("time interval")
+             ax.set_ylabel("cum dev range and std dev ratio")
+             ax.grid(True)
+             plt.show()
+
+         result = self.__printResult("hurstExponent", h, "hurstConstant", c)
+         return result
+
+     def approxEntropy(self, ds, m, r):
+         """
+         gets approximate entropy of time series (ref: wikipedia)
+
+         Parameters
+             ds : data set name or list or numpy array
+             m : length of compared run of data
+             r : filtering level
+         """
+         self.__printBanner("getting approximate entropy", ds)
+         ldata = self.getNumericData(ds)
+         aent = abs(self.__phi(ldata, m + 1, r) - self.__phi(ldata, m, r))
+         result = self.__printResult("approxEntropy", aent)
+         return result
+
+     def __phi(self, ldata, m, r):
+         """
+         phi function for approximate entropy
+
+         Parameters
+             ldata : data array
+             m : length of compared run of data
+             r : filtering level
+         """
+         le = len(ldata)
+         x = [[ldata[j] for j in range(i, i + m)] for i in range(le - m + 1)]
+         lex = len(x)
+         c = list()
+         for i in range(lex):
+             cnt = 0
+             for j in range(lex):
+                 cnt += (1 if maxListDist(x[i], x[j]) <= r else 0)
+             cnt /= (le - m + 1.0)
+             c.append(cnt)
+         return sum(np.log(c)) / (le - m + 1.0)
+
+
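A standalone sketch of the same approximate entropy definition; a regular signal should score lower than noise. The helper below is an independent reimplementation for illustration, not the module's code.

import numpy as np

def apen(u, m, r):
    #approximate entropy per the standard definition
    n = len(u)
    def phi(m):
        x = np.array([u[i:i + m] for i in range(n - m + 1)])
        c = [(np.max(np.abs(x - xi), axis=1) <= r).mean() for xi in x]  # Chebyshev matches
        return np.mean(np.log(c))
    return abs(phi(m + 1) - phi(m))

t = np.arange(300)
print(apen(np.sin(0.3 * t), 2, 0.2))  # regular, small
print(apen(np.random.default_rng(13).normal(size=300), 2, 0.2))  # irregular, larger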
+     def oneSpaceEntropy(self, ds, scaMethod="zscale"):
+         """
+         gets one space entropy (ref: Estimating mutual information by Kraskov)
+
+         Parameters
+             ds : data set name or list or numpy array
+             scaMethod : scaling method
+         """
+         self.__printBanner("getting one space entropy", ds)
+         data = self.getNumericData(ds)
+         sdata = sorted(data)
+         sdata = scaleData(sdata, scaMethod)
+         su = 0
+         n = len(sdata)
+         for i in range(1, n, 1):
+             t = abs(sdata[i] - sdata[i-1])
+             if t > 0:
+                 su += log(t)
+         su /= (n - 1)
+         ose = digammaFun(n) - digammaFun(1) + su
+         result = self.__printResult("entropy", ose)
+         return result
+
+
+     def plotCrossCorr(self, ds1, ds2, normed, lags):
+         """
+         plots cross correlation
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             normed : if True, input vectors are normalised to unit length
+             lags : num of lags
+         """
+         self.__printBanner("plotting cross correlation between two numeric", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         self.ensureSameSize([data1, data2])
+         plt.xcorr(data1, data2, normed=normed, maxlags=lags)
+         plt.show()
+
+     def getCrossCorr(self, ds1, ds2):
+         """
+         gets cross correlation
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+         """
+         self.__printBanner("getting cross correlation", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         self.ensureSameSize([data1, data2])
+         crossCorr = stt.ccf(data1, data2)
+         result = self.__printResult("crossCorr", crossCorr)
+         return result
+
+     def getFourierTransform(self, ds):
+         """
+         gets fast fourier transform
+
+         Parameters
+             ds : data set name or list or numpy array
+         """
+         self.__printBanner("getting fourier transform", ds)
+         data = self.getNumericData(ds)
+         ft = np.fft.rfft(data)
+         result = self.__printResult("fourierTransform", ft)
+         return result
+
+
+     def testStationaryAdf(self, ds, regression, autolag, sigLev=.05):
+         """
+         ADF stationarity test, null hypothesis is not stationary
+
+         Parameters
+             ds : data set name or list or numpy array
+             regression : constant and trend order to include in regression
+             autolag : method to use when automatically determining the lag
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing ADF stationary test", ds)
+         relist = ["c", "ct", "ctt", "nc"]
+         assert regression in relist, "invalid regression value"
+         alList = ["AIC", "BIC", "t-stat", None]
+         assert autolag in alList, "invalid autolag value"
+
+         data = self.getNumericData(ds)
+         re = stt.adfuller(data, regression=regression, autolag=autolag)
+         result = self.__printResult("stat", re[0], "pvalue", re[1], "num lags", re[2], "num observation for regression", re[3],
+         "critical values", re[4])
+         self.__printStat(re[0], re[1], "probably not stationary", "probably stationary", sigLev)
+         return result
+
+     def testStationaryKpss(self, ds, regression, nlags, sigLev=.05):
+         """
+         KPSS stationarity test, null hypothesis is stationary
+
+         Parameters
+             ds : data set name or list or numpy array
+             regression : constant and trend order to include in regression
+             nlags : no of lags
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing KPSS stationary test", ds)
+         relist = ["c", "ct"]
+         assert regression in relist, "invalid regression value"
+         nlList = [None, "auto", "legacy"]
+         assert nlags in nlList or type(nlags) == int, "invalid nlags value"
+
+         data = self.getNumericData(ds)
+         #the keyword is nlags in statsmodels 0.11 and later (it was lags before)
+         stat, pvalue, nLags, criticalValues = stt.kpss(data, regression=regression, nlags=nlags)
+         result = self.__printResult("stat", stat, "pvalue", pvalue, "num lags", nLags, "critical values", criticalValues)
+         self.__printStat(stat, pvalue, "probably stationary", "probably not stationary", sigLev)
+         return result
+
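A sketch of the stationarity diagnosis above: an ADF test on a random walk should fail to reject the unit root, while white noise rejects it decisively.

import numpy as np
import statsmodels.tsa.stattools as stt

rng = np.random.default_rng(17)
walk = np.cumsum(rng.normal(size=500))  # unit root, not stationary
noise = rng.normal(size=500)            # stationary
print(stt.adfuller(walk, regression="c", autolag="AIC")[1])   # large p-value
print(stt.adfuller(noise, regression="c", autolag="AIC")[1])  # tiny p-value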
+     def testNormalJarqBera(self, ds, sigLev=.05):
+         """
+         Jarque-Bera normality test
+
+         Parameters
+             ds : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Jarque-Bera normality test", ds)
+         data = self.getNumericData(ds)
+         jb, jbpv, skew, kurtosis = sstt.jarque_bera(data)
+         result = self.__printResult("stat", jb, "pvalue", jbpv, "skew", skew, "kurtosis", kurtosis)
+         self.__printStat(jb, jbpv, "probably gaussian", "probably not gaussian", sigLev)
+         return result
+
+
+     def testNormalShapWilk(self, ds, sigLev=.05):
+         """
+         Shapiro-Wilk normality test
+
+         Parameters
+             ds : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Shapiro-Wilk normality test", ds)
+         data = self.getNumericData(ds)
+         stat, pvalue = sta.shapiro(data)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably gaussian", "probably not gaussian", sigLev)
+         return result
+
+     def testNormalDagast(self, ds, sigLev=.05):
+         """
+         D'Agostino's K square normality test
+
+         Parameters
+             ds : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing D'Agostino's K square normality test", ds)
+         data = self.getNumericData(ds)
+         stat, pvalue = sta.normaltest(data)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably gaussian", "probably not gaussian", sigLev)
+         return result
+
+     def testDistrAnderson(self, ds, dist, sigLev=.05):
+         """
+         Anderson test for normal, expon, logistic, gumbel, gumbel_l, gumbel_r
+
+         Parameters
+             ds : data set name or list or numpy array
+             dist : type of distribution
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Anderson test for various distributions", ds)
+         diList = ["norm", "expon", "logistic", "gumbel", "gumbel_l", "gumbel_r", "extreme1"]
+         assert dist in diList, "invalid distribution"
+
+         data = self.getNumericData(ds)
+         #the distribution needs to be passed through, otherwise normal is always assumed
+         re = sta.anderson(data, dist=dist)
+         slAlpha = int(100 * sigLev)
+         msg = "significant value not found"
+         for i in range(len(re.critical_values)):
+             sl, cv = re.significance_level[i], re.critical_values[i]
+             if int(sl) == slAlpha:
+                 if re.statistic < cv:
+                     msg = "probably {} at the {:.3f} significance level".format(dist, sl)
+                 else:
+                     msg = "probably not {} at the {:.3f} significance level".format(dist, sl)
+         result = self.__printResult("stat", re.statistic, "test", msg)
+         print(msg)
+         return result
+
+     def testSkew(self, ds, sigLev=.05):
+         """
+         test skew wrt normal distr
+
+         Parameters
+             ds : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("testing skew wrt normal distr", ds)
+         data = self.getNumericData(ds)
+         stat, pvalue = sta.skewtest(data)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same skew as normal distribution", "probably not same skew as normal distribution", sigLev)
+         return result
+
+     def testTwoSampleStudent(self, ds1, ds2, sigLev=.05):
+         """
+         student t 2 sample test
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing student t 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.ttest_ind(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
+         return result
+
+     def testTwoSampleKs(self, ds1, ds2, sigLev=.05):
+         """
+         Kolmogorov Smirnov 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Kolmogorov Smirnov 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.ks_2samp(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
+         return result
+
+
+     def testTwoSampleMw(self, ds1, ds2, sigLev=.05):
+         """
+         Mann-Whitney 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Mann-Whitney 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.mannwhitneyu(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
+         return result
+
+     def testTwoSampleWilcox(self, ds1, ds2, sigLev=.05):
+         """
+         Wilcoxon Signed-Rank 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Wilcoxon Signed-Rank 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.wilcoxon(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
+         return result
+
+
+     def testTwoSampleKw(self, ds1, ds2, sigLev=.05):
+         """
+         Kruskal-Wallis 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Kruskal-Wallis 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.kruskal(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
+         return result
+
+     def testTwoSampleFriedman(self, ds1, ds2, ds3, sigLev=.05):
+         """
+         Friedman statistic for 3 samples
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             ds3 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Friedman test for 3 samples", ds1, ds2, ds3)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         data3 = self.getNumericData(ds3)
+         stat, pvalue = sta.friedmanchisquare(data1, data2, data3)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
+         return result
+
+     def testTwoSampleEs(self, ds1, ds2, sigLev=.05):
+         """
+         Epps Singleton 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Epps Singleton 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.epps_singleton_2samp(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
+         return result
+
+     def testTwoSampleAnderson(self, ds1, ds2, sigLev=.05):
+         """
+         Anderson 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Anderson 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         dseq = (data1, data2)
+         stat, critValues, sLev = sta.anderson_ksamp(dseq)
+         slAlpha = 100 * sigLev
+
+         if slAlpha == 10:
+             cv = critValues[1]
+         elif slAlpha == 5:
+             cv = critValues[2]
+         elif slAlpha == 2.5:
+             cv = critValues[3]
+         elif slAlpha == 1:
+             cv = critValues[4]
+         else:
+             cv = None
+
+         result = self.__printResult("stat", stat, "critValues", critValues, "critValue", cv, "significanceLevel", sLev)
+         print("stat: {:.3f}".format(stat))
+         if cv is None:
+             msg = "critical value not found for the provided significance level"
+         else:
+             if stat < cv:
+                 msg = "probably same distribution at the {:.3f} significance level".format(sigLev)
+             else:
+                 msg = "probably not same distribution at the {:.3f} significance level".format(sigLev)
+         print(msg)
+         return result
+
+
+     def testTwoSampleScaleAb(self, ds1, ds2, sigLev=.05):
+         """
+         Ansari Bradley 2 sample scale statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Ansari Bradley 2 sample scale test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.ansari(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same scale", "probably not same scale", sigLev)
+         return result
+
+     def testTwoSampleScaleMood(self, ds1, ds2, sigLev=.05):
+         """
+         Mood 2 sample scale statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Mood 2 sample scale test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.mood(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same scale", "probably not same scale", sigLev)
+         return result
+
+     def testTwoSampleVarBartlet(self, ds1, ds2, sigLev=.05):
+         """
+         Bartlett 2 sample variance statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Bartlett 2 sample variance test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.bartlett(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same variance", "probably not same variance", sigLev)
+         return result
+
+     def testTwoSampleVarLevene(self, ds1, ds2, sigLev=.05):
+         """
+         Levene 2 sample variance statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Levene 2 sample variance test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.levene(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same variance", "probably not same variance", sigLev)
+         return result
+
+     def testTwoSampleVarFk(self, ds1, ds2, sigLev=.05):
+         """
+         Fligner-Killeen 2 sample variance statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Fligner-Killeen 2 sample variance test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue = sta.fligner(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue)
+         self.__printStat(stat, pvalue, "probably same variance", "probably not same variance", sigLev)
+         return result
+
+     def testTwoSampleMedMood(self, ds1, ds2, sigLev=.05):
+         """
+         Mood 2 sample median statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Mood 2 sample median test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat, pvalue, median, ctable = sta.median_test(data1, data2)
+         result = self.__printResult("stat", stat, "pvalue", pvalue, "median", median, "contingencyTable", ctable)
+         self.__printStat(stat, pvalue, "probably same median", "probably not same median", sigLev)
+         return result
+
+     def testTwoSampleZc(self, ds1, ds2, sigLev=.05):
+         """
+         Zhang-C 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Zhang-C 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         l1 = len(data1)
+         l2 = len(data2)
+         l = l1 + l2
+
+         #find ranks
+         pooled = np.concatenate([data1, data2])
+         ranks = findRanks(data1, pooled)
+         ranks.extend(findRanks(data2, pooled))
+
+         s1 = 0.0
+         for i in range(1, l1+1):
+             s1 += math.log(l1 / (i - 0.5) - 1.0) * math.log(l / (ranks[i-1] - 0.5) - 1.0)
+
+         s2 = 0.0
+         for i in range(1, l2+1):
+             s2 += math.log(l2 / (i - 0.5) - 1.0) * math.log(l / (ranks[l1 + i - 1] - 0.5) - 1.0)
+         stat = (s1 + s2) / l
+         print(formatFloat(3, stat, "stat:"))
+         return stat
+
+     def testTwoSampleZa(self, ds1, ds2, sigLev=.05):
+         """
+         Zhang-A 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Zhang-A 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         l1 = len(data1)
+         l2 = len(data2)
+         l = l1 + l2
+         pooled = np.concatenate([data1, data2])
+         cd1 = CumDistr(data1)
+         cd2 = CumDistr(data2)
+         asum = 0.0
+         for i in range(1, l+1):
+             v = pooled[i-1]
+             f1 = cd1.getDistr(v)
+             f2 = cd2.getDistr(v)
+
+             #guard the end points where a distribution value of 0 or 1 would make the log blow up
+             t1 = 0 if f1 == 0 else f1 * math.log(f1)
+             t2 = 0 if f1 == 1.0 else (1.0 - f1) * math.log(1.0 - f1)
+             asum += l1 * (t1 + t2) / ((i - 0.5) * (l - i + 0.5))
+             t1 = 0 if f2 == 0 else f2 * math.log(f2)
+             t2 = 0 if f2 == 1.0 else (1.0 - f2) * math.log(1.0 - f2)
+             asum += l2 * (t1 + t2) / ((i - 0.5) * (l - i + 0.5))
+         stat = -asum
+         print(formatFloat(3, stat, "stat:"))
+         return stat
+
+     def testTwoSampleZk(self, ds1, ds2, sigLev=.05):
+         """
+         Zhang-K 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing Zhang-K 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         l1 = len(data1)
+         l2 = len(data2)
+         l = l1 + l2
+         pooled = np.concatenate([data1, data2])
+         cd1 = CumDistr(data1)
+         cd2 = CumDistr(data2)
+         cd = CumDistr(pooled)
+
+         maxStat = None
+         for i in range(1, l+1):
+             v = pooled[i-1]
+             f1 = cd1.getDistr(v)
+             f2 = cd2.getDistr(v)
+             f = cd.getDistr(v)
+
+             t1 = 0 if f1 == 0 else f1 * math.log(f1 / f)
+             t2 = 0 if f1 == 1.0 else (1.0 - f1) * math.log((1.0 - f1) / (1.0 - f))
+             stat = l1 * (t1 + t2)
+             t1 = 0 if f2 == 0 else f2 * math.log(f2 / f)
+             t2 = 0 if f2 == 1.0 else (1.0 - f2) * math.log((1.0 - f2) / (1.0 - f))
+             stat += l2 * (t1 + t2)
+             if maxStat is None or stat > maxStat:
+                 maxStat = stat
+         print(formatFloat(3, maxStat, "stat:"))
+         return maxStat
+
+
+     def testTwoSampleCvm(self, ds1, ds2, sigLev=.05):
+         """
+         2 sample Cramer von Mises statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+             sigLev : statistical significance level
+         """
+         self.__printBanner("doing 2 sample CVM test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         data = np.concatenate((data1, data2))
+         rdata = sta.rankdata(data)
+         n = len(data1)
+         m = len(data2)
+         l = n + m
+
+         s1 = 0
+         for i in range(n):
+             t = rdata[i] - (i+1)
+             s1 += (t * t)
+         s1 *= n
+
+         s2 = 0
+         for i in range(m):
+             t = rdata[i + n] - (i+1)
+             s2 += (t * t)
+         s2 *= m
+
+         u = s1 + s2
+         stat = u / (n * m * l) - (4 * m * n - 1) / (6 * l)
+         result = self.__printResult("stat", stat)
+         return result
+
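A quick sketch of the two-sample machinery in this block: a half-sigma shift is comfortably detected by the Kolmogorov-Smirnov test at these sample sizes, while two draws from the same distribution are not.

import numpy as np
import scipy.stats as sta

rng = np.random.default_rng(19)
a = rng.normal(0.0, 1.0, size=300)
b = rng.normal(0.5, 1.0, size=300)  # shifted by half a standard deviation
print(sta.ks_2samp(a, b).pvalue)                               # small
print(sta.ks_2samp(a, rng.normal(0.0, 1.0, size=300)).pvalue)  # large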
+     def ensureSameSize(self, dlist):
+         """
+         ensures all data sets are of same size
+
+         Parameters
+             dlist : data source list
+         """
+         le = None
+         for d in dlist:
+             cle = len(d)
+             if le is None:
+                 le = cle
+             else:
+                 assert cle == le, "all data sets need to be of same size"
+
+
+     def testTwoSampleWasserstein(self, ds1, ds2):
+         """
+         Wasserstein 2 sample statistic
+
+         Parameters
+             ds1 : data set name or list or numpy array
+             ds2 : data set name or list or numpy array
+         """
+         self.__printBanner("doing Wasserstein distance 2 sample test", ds1, ds2)
+         data1 = self.getNumericData(ds1)
+         data2 = self.getNumericData(ds2)
+         stat = sta.wasserstein_distance(data1, data2)
+         sd = np.std(np.concatenate([data1, data2]))
+         nstat = stat / sd
+         result = self.__printResult("stat", stat, "normalizedStat", nstat)
+         return result
+
+     def getMaxRelMinRedFeatures(self, fdst, tdst, nfeatures, nbins=20):
+         """
+         gets top n features based on max relevance and min redundancy algorithm
+
+         Parameters
+             fdst : list of pair of data set name or list or numpy array and data type
+             tdst : target data set name or list or numpy array and data type (cat for classification num for regression)
+             nfeatures : desired no of features
+             nbins : no of bins for numerical data
+         """
+         self.__printBanner("doing max relevance min redundancy feature selection")
+         return self.getMutInfoFeatures(fdst, tdst, nfeatures, "mrmr", nbins)
+
+     def getJointMutInfoFeatures(self, fdst, tdst, nfeatures, nbins=20):
+         """
+         gets top n features based on joint mutual information algorithm
+
+         Parameters
+             fdst : list of pair of data set name or list or numpy array and data type
+             tdst : target data set name or list or numpy array and data type (cat for classification num for regression)
+             nfeatures : desired no of features
+             nbins : no of bins for numerical data
+         """
+         self.__printBanner("doing joint mutual info feature selection")
+         return self.getMutInfoFeatures(fdst, tdst, nfeatures, "jmi", nbins)
+
+     def getCondMutInfoMaxFeatures(self, fdst, tdst, nfeatures, nbins=20):
+         """
+         gets top n features based on conditional mutual information maximization algorithm
+
+         Parameters
+             fdst : list of pair of data set name or list or numpy array and data type
+             tdst : target data set name or list or numpy array and data type (cat for classification num for regression)
+             nfeatures : desired no of features
+             nbins : no of bins for numerical data
+         """
+         self.__printBanner("doing conditional mutual info max feature selection")
+         return self.getMutInfoFeatures(fdst, tdst, nfeatures, "cmim", nbins)
+
+     def getInteractCapFeatures(self, fdst, tdst, nfeatures, nbins=20):
+         """
+         gets top n features based on interaction capping algorithm
+
+         Parameters
+             fdst : list of pair of data set name or list or numpy array and data type
+             tdst : target data set name or list or numpy array and data type (cat for classification num for regression)
+             nfeatures : desired no of features
+             nbins : no of bins for numerical data
+         """
+         self.__printBanner("doing interaction capped feature selection")
+         return self.getMutInfoFeatures(fdst, tdst, nfeatures, "icap", nbins)
+
+     def getMutInfoFeatures(self, fdst, tdst, nfeatures, algo, nbins=20):
+         """
+         gets top n features based on various mutual information based algorithms
+         ref: Conditional likelihood maximisation : A unifying framework for information
+         theoretic feature selection, Gavin Brown
+
+         Parameters
+             fdst : list of pair of data set name or list or numpy array and data type
+             tdst : target data set name or list or numpy array and data type (cat for classification num for regression)
+             nfeatures : desired no of features
+             algo : mi based feature selection algorithm
+             nbins : no of bins for numerical data
+         """
+         #verify data source types
+         le = len(fdst)
+         nfeatGiven = int(le / 2)
+         assertGreater(nfeatGiven, nfeatures, "no of available features should be greater than no of features to be selected")
+         fds = list()
+         types = ["num", "cat"]
+         for i in range(0, le, 2):
+             ds = fdst[i]
+             dt = fdst[i+1]
+             assertInList(dt, types, "invalid type for data source " + dt)
+             data = self.getNumericData(ds) if dt == "num" else self.getCatData(ds)
+             p = (ds, dt)
+             fds.append(p)
+         algos = ["mrmr", "jmi", "cmim", "icap"]
+         assertInList(algo, algos, "invalid feature selection algo " + algo)
+
+         assertInList(tdst[1], types, "invalid type for data source " + tdst[1])
+         data = self.getNumericData(tdst[0]) if tdst[1] == "num" else self.getCatData(tdst[0])
+
+         sfds = list()
+         selected = set()
+         relevancies = dict()
+         for i in range(nfeatures):
+             scorem = None
+             dsm = None
+             dsmt = None
+             for ds, dt in fds:
+                 if ds not in selected:
+                     #relevancy
+                     if ds in relevancies:
+                         mutInfo = relevancies[ds]
+                     else:
+                         mutInfo = self.getMutualInfo([ds, dt, tdst[0], tdst[1]], nbins)["mutInfo"]
+                         relevancies[ds] = mutInfo
+                     relev = mutInfo
+
+                     #redundancy with respect to already selected features
+                     reds = list()
+                     for sds, sdt, _ in sfds:
+                         mutInfo = self.getMutualInfo([ds, dt, sds, sdt], nbins)["mutInfo"]
+                         mutInfoCnd = self.getCondMutualInfo([ds, dt, sds, sdt, tdst[0], tdst[1]], nbins)["condMutInfo"] \
+                         if algo != "mrmr" else 0
+
+                         red = mutInfo - mutInfoCnd
+                         reds.append(red)
+
+                     if algo == "mrmr" or algo == "jmi":
+                         redun = sum(reds) / len(sfds) if len(sfds) > 0 else 0
+                     elif algo == "cmim" or algo == "icap":
+                         redun = max(reds) if len(sfds) > 0 else 0
+                         if algo == "icap":
+                             redun = max(0, redun)
+                     score = relev - redun
+                     if scorem is None or score > scorem:
+                         scorem = score
+                         dsm = ds
+                         dsmt = dt
+
+             pa = (dsm, dsmt, scorem)
+             sfds.append(pa)
+             selected.add(dsm)
+
+         selFeatures = list(map(lambda r : (r[0], r[2]), sfds))
+         result = self.__printResult("selFeatures", selFeatures)
+         return result
+
+
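A self-contained sketch of the relevance-minus-redundancy score that drives the selection loop above, using scikit-learn's mutual_info_score on synthetic labels; after an informative feature f1 is selected, an exact copy of it scores worse than pure noise.

import numpy as np
from sklearn.metrics import mutual_info_score

rng = np.random.default_rng(23)
t = rng.integers(0, 2, size=1000)  # binary target
f1 = t ^ (rng.random(1000) < 0.1)  # informative feature, selected first
f2 = f1.copy()                     # redundant copy
f3 = rng.integers(0, 2, size=1000) # noise
selected = [f1]
for name, f in [("f2", f2), ("f3", f3)]:
    relev = mutual_info_score(f, t)
    redun = np.mean([mutual_info_score(f, s) for s in selected])
    print(name, relev - redun)     # f3 beats the redundant f2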
+     def getFastCorrFeatures(self, fdst, tdst, delta, nbins=20):
+         """
+         gets top features based on Fast Correlation Based Filter (FCBF)
+         ref: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution,
+         Lei Yu
+
+         Parameters
+             fdst : list of pair of data set name or list or numpy array and data type
+             tdst : target data set name or list or numpy array and data type (cat for classification num for regression)
+             delta : feature, target correlation threshold
+             nbins : no of bins for numerical data
+         """
+         le = len(fdst)
+         nfeatGiven = int(le / 2)
+         fds = list()
+         types = ["num", "cat"]
+         for i in range(0, le, 2):
+             ds = fdst[i]
+             dt = fdst[i+1]
+             assertInList(dt, types, "invalid type for data source " + dt)
+             data = self.getNumericData(ds) if dt == "num" else self.getCatData(ds)
+             p = (ds, dt)
+             fds.append(p)
+
+         assertInList(tdst[1], types, "invalid type for data source " + tdst[1])
+         data = self.getNumericData(tdst[0]) if tdst[1] == "num" else self.getCatData(tdst[0])
+
+         #get features with symmetric uncertainty above threshold
+         tentr = self.getAnyEntropy(tdst[0], tdst[1], nbins)["entropy"]
+         rfeatures = list()
+         fentrs = dict()
+         for ds, dt in fds:
+             mutInfo = self.getMutualInfo([ds, dt, tdst[0], tdst[1]], nbins)["mutInfo"]
+             fentr = self.getAnyEntropy(ds, dt, nbins)["entropy"]
+             sunc = 2 * mutInfo / (tentr + fentr)
+             if sunc >= delta:
+                 f = [ds, dt, sunc, False]
+                 rfeatures.append(f)
+                 fentrs[ds] = fentr
+
+         #sort descending by symmetric uncertainty
+         rfeatures.sort(key=lambda e : e[2], reverse=True)
+
+         #discard redundant features
+         le = len(rfeatures)
+         for i in range(le):
+             if rfeatures[i][3]:
+                 continue
+             for j in range(i+1, le, 1):
+                 if rfeatures[j][3]:
+                     continue
+                 mutInfo = self.getMutualInfo([rfeatures[i][0], rfeatures[i][1], rfeatures[j][0], rfeatures[j][1]], nbins)["mutInfo"]
+                 sunc = 2 * mutInfo / (fentrs[rfeatures[i][0]] + fentrs[rfeatures[j][0]])
+                 if sunc >= rfeatures[j][2]:
+                     rfeatures[j][3] = True
+
+         frfeatures = list(filter(lambda f : not f[3], rfeatures))
+         selFeatures = list(map(lambda f : [f[0], f[2]], frfeatures))
+         result = self.__printResult("selFeatures", selFeatures)
+         return result
+
+     def getInfoGainFeatures(self, fdst, tdst, nfeatures, nsplit, nbins=20):
+         """
+         gets top n features based on information gain or entropy loss
+
+         Parameters
+             fdst : list of pair of data set name or list or numpy array and data type
+             tdst : target data set name or list or numpy array and data type (cat for classification num for regression)
+             nsplit : num of splits
+             nfeatures : desired no of features
+             nbins : no of bins for numerical data
+         """
+         le = len(fdst)
+         nfeatGiven = int(le / 2)
+         assertGreater(nfeatGiven, nfeatures, "available features should be greater than desired")
+         fds = list()
+         types = ["num", "cat"]
+         for i in range(0, le, 2):
+             ds = fdst[i]
+             dt = fdst[i+1]
+             assertInList(dt, types, "invalid type for data source " + dt)
+             data = self.getNumericData(ds) if dt == "num" else self.getCatData(ds)
+             p = (ds, dt)
+             fds.append(p)
+
+         assertInList(tdst[1], types, "invalid type for data source " + tdst[1])
+         assertGreater(nsplit, 3, "minimum 4 splits necessary")
+         tdata = self.getNumericData(tdst[0]) if tdst[1] == "num" else self.getCatData(tdst[0])
+         tentr = self.getAnyEntropy(tdst[0], tdst[1], nbins)["entropy"]
+         sz = len(tdata)
+
+         sfds = list()
+         for ds, dt in fds:
+             if dt == "num":
+                 fd = self.getNumericData(ds)
+                 _, _, vmax, vmin = self.__getBasicStats(fd)
+                 intv = (vmax - vmin) / nsplit
+                 maxig = None
+                 spmin = vmin + intv
+                 spmax = vmax - 0.9 * intv
+
+                 #iterate all splits
+                 for sp in np.arange(spmin, spmax, intv):
+                     ltvals = list()
+                     gevals = list()
+                     for i in range(len(fd)):
+                         if fd[i] < sp:
+                             ltvals.append(tdata[i])
+                         else:
+                             gevals.append(tdata[i])
+
+                     self.addListNumericData(ltvals, "spds") if tdst[1] == "num" else self.addListCatData(ltvals, "spds")
+                     lten = self.getAnyEntropy("spds", tdst[1], nbins)["entropy"]
+                     self.addListNumericData(gevals, "spds") if tdst[1] == "num" else self.addListCatData(gevals, "spds")
+                     geen = self.getAnyEntropy("spds", tdst[1], nbins)["entropy"]
+
+                     #info gain
+                     ig = tentr - (len(ltvals) * lten / sz + len(gevals) * geen / sz)
+                     if maxig is None or ig > maxig:
+                         maxig = ig
+
+                 pa = (ds, maxig)
+                 sfds.append(pa)
+             else:
+                 fd = self.getCatData(ds)
+                 #use a separate name for the unique values so the feature list fds is not clobbered
+                 fvals = set(fd)
+                 fdps = genPowerSet(fvals)
+                 maxig = None
+
+                 #iterate all subsets
+                 for s in fdps:
+                     if len(s) == len(fvals):
+                         continue
+                     invals = list()
+                     exvals = list()
+                     for i in range(len(fd)):
+                         if fd[i] in s:
+                             invals.append(tdata[i])
+                         else:
+                             exvals.append(tdata[i])
+
+                     self.addListNumericData(invals, "spds") if tdst[1] == "num" else self.addListCatData(invals, "spds")
+                     inen = self.getAnyEntropy("spds", tdst[1], nbins)["entropy"]
+                     self.addListNumericData(exvals, "spds") if tdst[1] == "num" else self.addListCatData(exvals, "spds")
+                     exen = self.getAnyEntropy("spds", tdst[1], nbins)["entropy"]
+
+                     ig = tentr - (len(invals) * inen / sz + len(exvals) * exen / sz)
+                     if maxig is None or ig > maxig:
+                         maxig = ig
+
+                 pa = (ds, maxig)
+                 sfds.append(pa)
+
+         #sort by info gain
+         sfds.sort(key=lambda v : v[1], reverse=True)
+
+         result = self.__printResult("selFeatures", sfds[:nfeatures])
+         return result
+
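A compact sketch of the split-based information gain computed above for a numeric feature; the gain should peak at the split that actually generated the labels.

import numpy as np
from scipy.stats import entropy

def ent(labels):
    _, cnt = np.unique(labels, return_counts=True)
    return entropy(cnt / cnt.sum())

rng = np.random.default_rng(29)
feat = rng.normal(size=500)
target = (feat > 0.2).astype(int)  # labels generated by a split at 0.2
te = ent(target)
for sp in [-1.0, 0.2, 1.0]:
    lt, ge = target[feat < sp], target[feat >= sp]
    ig = te - (len(lt) * ent(lt) + len(ge) * ent(ge)) / len(target)
    print(sp, round(ig, 3))  # maximal at sp = 0.2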
+     def __stackData(self, *dsl):
+         """
+         stacks columns to create a matrix
+
+         Parameters
+             dsl : data source list
+         """
+         dlist = tuple(map(lambda ds : self.getNumericData(ds), dsl))
+         self.ensureSameSize(dlist)
+         dmat = np.column_stack(dlist)
+         return dmat
+
+     def __printBanner(self, msg, *dsl):
+         """
+         prints banner for any function
+
+         Parameters
+             msg : message
+             dsl : list of data set name or list or numpy array
+         """
+         tags = list(map(lambda ds : ds if type(ds) == str else "anonymous", dsl))
+         forData = " for data sets " if tags else ""
+         msg = msg + forData + " ".join(tags)
+         if self.verbose:
+             print("\n== " + msg + " ==")
+
+
+     def __printDone(self):
+         """
+         prints done message
+         """
+         if self.verbose:
+             print("done")
+
+     def __printStat(self, stat, pvalue, nhMsg, ahMsg, sigLev=.05):
+         """
+         generic stat and pvalue output
+
+         Parameters
+             stat : stat value
+             pvalue : p value
+             nhMsg : message printed when the null hypothesis holds (pvalue above sigLev)
+             ahMsg : message printed when the null hypothesis is rejected
+             sigLev : significance level
+         """
+         if self.verbose:
+             print("\ntest result:")
+             print("stat: {:.3f}".format(stat))
+             print("pvalue: {:.3f}".format(pvalue))
+             print("significance level: {:.3f}".format(sigLev))
+             print(nhMsg if pvalue > sigLev else ahMsg)
+
+     def __printResult(self, *values):
+         """
+         prints results
+
+         Parameters
+             values : flattened key and value pairs
+         """
+         result = dict()
+         assert len(values) % 2 == 0, "key value list should have even number of items"
+         for i in range(0, len(values), 2):
+             result[values[i]] = values[i+1]
+         if self.verbose:
+             print("result details:")
+             self.pp.pprint(result)
+         return result
+
3109
+ def __getBasicStats(self, data):
3110
+ """
3111
+ get mean and std dev
3112
+
3113
+ Parameters
3114
+ data : numpy array
3115
+ """
3116
+ mean = np.average(data)
3117
+ sd = np.std(data)
3118
+ r = (mean, sd, np.max(data), np.min(data))
3119
+ return r
3120
+
3121
+
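+ # Illustrative sketch (not part of the library API): the information gain used by
+ # selFeatures above is the parent entropy minus the size weighted entropies of the
+ # two partitions created by a candidate split. The helper below recomputes that
+ # quantity for one binary split, assuming nothing beyond the formula itself.
+ def _demoInfoGain():
+     import math
+     from collections import Counter
+
+     def entropy(vals):
+         #shannon entropy of a categorical value list
+         cnt = Counter(vals)
+         n = len(vals)
+         return -sum((c / n) * math.log2(c / n) for c in cnt.values())
+
+     feature = [2.0, 3.5, 1.0, 4.2, 5.1, 0.5]
+     target = ["a", "b", "a", "b", "b", "a"]
+     split = 3.0
+     lt = [t for f, t in zip(feature, target) if f < split]
+     ge = [t for f, t in zip(feature, target) if f >= split]
+     sz = len(target)
+     ig = entropy(target) - (len(lt) * entropy(lt) / sz + len(ge) * entropy(ge) / sz)
+     print("info gain {:.3f}".format(ig))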
matumizi/mcsim.py ADDED
@@ -0,0 +1,552 @@
+ #!/usr/local/bin/python3
+
+ # avenir-python: Machine Learning
+ # Author: Pranab Ghosh
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
+ # may not use this file except in compliance with the License. You may
+ # obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+
+ # Package imports
+ import os
+ import sys
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import matplotlib
+ import random
+ import jprops
+ import statistics
+ from matplotlib import pyplot
+ from .util import *
+ from .mlutil import *
+ from .sampler import *
+
+ class MonteCarloSimulator(object):
+     """
+     monte carlo simulator for integration and various statistics of complex functions
+     """
+     def __init__(self, numIter, callback, logFilePath, logLevName):
+         """
+         constructor
+
+         Parameters
+         numIter : num of iterations
+         callback : call back method
+         logFilePath : log file path
+         logLevName : log level
+         """
+         self.samplers = list()
+         self.numIter = numIter
+         self.callback = callback
+         self.extraArgs = None
+         self.output = list()
+         self.sum = None
+         self.mean = None
+         self.sd = None
+         self.replSamplers = dict()
+         self.prSamples = None
+
+         self.logger = None
+         if logFilePath is not None:
+             self.logger = createLogger(__name__, logFilePath, logLevName)
+             self.logger.info("******** starting new session of MonteCarloSimulator")
+
+     def registerBernoulliTrialSampler(self, pr):
+         """
+         bernoulli trial sampler
+
+         Parameters
+         pr : probability
+         """
+         self.samplers.append(BernoulliTrialSampler(pr))
+
+     def registerPoissonSampler(self, rateOccur, maxSamp):
+         """
+         poisson sampler
+
+         Parameters
+         rateOccur : rate of occurence
+         maxSamp : max limit on no of samples
+         """
+         self.samplers.append(PoissonSampler(rateOccur, maxSamp))
+
+     def registerUniformSampler(self, minv, maxv):
+         """
+         uniform sampler
+
+         Parameters
+         minv : min value
+         maxv : max value
+         """
+         self.samplers.append(UniformNumericSampler(minv, maxv))
+
+     def registerTriangularSampler(self, xmin, xmax, vertexValue, vertexPos=None):
+         """
+         triangular sampler
+
+         Parameters
+         xmin : min value
+         xmax : max value
+         vertexValue : distr value at vertex
+         vertexPos : vertex position
+         """
+         self.samplers.append(TriangularRejectSampler(xmin, xmax, vertexValue, vertexPos))
+
+     def registerGaussianSampler(self, mean, sd):
+         """
+         gaussian sampler
+
+         Parameters
+         mean : mean
+         sd : std deviation
+         """
+         self.samplers.append(GaussianRejectSampler(mean, sd))
+
+     def registerNormalSampler(self, mean, sd):
+         """
+         gaussian sampler using numpy
+
+         Parameters
+         mean : mean
+         sd : std deviation
+         """
+         self.samplers.append(NormalSampler(mean, sd))
+
+     def registerLogNormalSampler(self, mean, sd):
+         """
+         log normal sampler using numpy
+
+         Parameters
+         mean : mean
+         sd : std deviation
+         """
+         self.samplers.append(LogNormalSampler(mean, sd))
+
+     def registerParetoSampler(self, mode, shape):
+         """
+         pareto sampler using numpy
+
+         Parameters
+         mode : mode
+         shape : shape
+         """
+         self.samplers.append(ParetoSampler(mode, shape))
+
+     def registerGammaSampler(self, shape, scale):
+         """
+         gamma sampler using numpy
+
+         Parameters
+         shape : shape
+         scale : scale
+         """
+         self.samplers.append(GammaSampler(shape, scale))
+
+     def registerDiscreteRejectSampler(self, xmin, xmax, step, *values):
+         """
+         discrete int sampler
+
+         Parameters
+         xmin : min value
+         xmax : max value
+         step : discrete step
+         values : distr values
+         """
+         self.samplers.append(DiscreteRejectSampler(xmin, xmax, step, *values))
+
+     def registerNonParametricSampler(self, minv, binWidth, *values):
+         """
+         nonparametric sampler
+
+         Parameters
+         minv : min value
+         binWidth : bin width
+         values : distr values
+         """
+         sampler = NonParamRejectSampler(minv, binWidth, *values)
+         sampler.sampleAsFloat()
+         self.samplers.append(sampler)
+
+     def registerMultiVarNormalSampler(self, numVar, *values):
+         """
+         multi var gaussian sampler using numpy
+
+         Parameters
+         numVar : no of variables
+         values : numVar mean values followed by numVar x numVar values for covar matrix
+         """
+         self.samplers.append(MultiVarNormalSampler(numVar, *values))
+
+     def registerJointNonParamRejectSampler(self, xmin, xbinWidth, xnbin, ymin, ybinWidth, ynbin, *values):
+         """
+         joint nonparametric sampler
+
+         Parameters
+         xmin : min value for x
+         xbinWidth : bin width for x
+         xnbin : no of bins for x
+         ymin : min value for y
+         ybinWidth : bin width for y
+         ynbin : no of bins for y
+         values : distr values
+         """
+         self.samplers.append(JointNonParamRejectSampler(xmin, xbinWidth, xnbin, ymin, ybinWidth, ynbin, *values))
+
+     def registerRangePermutationSampler(self, minv, maxv, *numShuffles):
+         """
+         permutation sampler with range
+
+         Parameters
+         minv : min of range
+         maxv : max of range
+         numShuffles : no of shuffles or range of no of shuffles
+         """
+         self.samplers.append(PermutationSampler.createSamplerWithRange(minv, maxv, *numShuffles))
+
+     def registerValuesPermutationSampler(self, values, *numShuffles):
+         """
+         permutation sampler with values
+
+         Parameters
+         values : list data
+         numShuffles : no of shuffles or range of no of shuffles
+         """
+         self.samplers.append(PermutationSampler.createSamplerWithValues(values, *numShuffles))
+
+     def registerNormalSamplerWithTrendCycle(self, mean, stdDev, trend, cycle, step=1):
+         """
+         normal sampler with trend and cycle
+
+         Parameters
+         mean : mean
+         stdDev : std deviation
+         trend : trend delta
+         cycle : cycle values wrt base mean
+         step : adjustment step for cycle and trend
+         """
+         self.samplers.append(NormalSamplerWithTrendCycle(mean, stdDev, trend, cycle, step))
+
+     def registerCustomSampler(self, sampler):
+         """
+         custom sampler
+
+         Parameters
+         sampler : sampler with sample() method
+         """
+         self.samplers.append(sampler)
+
+     def registerEventSampler(self, intvSampler, valSampler=None):
+         """
+         event sampler
+
+         Parameters
+         intvSampler : interval sampler
+         valSampler : value sampler
+         """
+         self.samplers.append(EventSampler(intvSampler, valSampler))
+
+     def registerMetropolitanSampler(self, propStdDev, minv, binWidth, values):
+         """
+         metropolitan sampler
+
+         Parameters
+         propStdDev : proposal distr std dev
+         minv : min domain value for target distr
+         binWidth : bin width
+         values : target distr values
+         """
+         self.samplers.append(MetropolitanSampler(propStdDev, minv, binWidth, values))
+
+     def setSampler(self, var, iter, sampler):
+         """
+         set sampler for some variable when iteration reaches certain point
+
+         Parameters
+         var : sampler index
+         iter : iteration count
+         sampler : new sampler
+         """
+         key = (var, iter)
+         self.replSamplers[key] = sampler
+
+     def registerExtraArgs(self, *args):
+         """
+         extra args
+
+         Parameters
+         args : extra argument list
+         """
+         self.extraArgs = args
+
+     def replSampler(self, iter):
+         """
+         replace sampler for this iteration
+
+         Parameters
+         iter : iteration number
+         """
+         if len(self.replSamplers) > 0:
+             for v in range(self.numVars):
+                 key = (v, iter)
+                 if key in self.replSamplers:
+                     sampler = self.replSamplers[key]
+                     self.samplers[v] = sampler
+
+     def run(self):
+         """
+         run simulator
+         """
+         self.sum = None
+         self.mean = None
+         self.sd = None
+         self.numVars = len(self.samplers)
+         vOut = 0
+
+         for i in range(self.numIter):
+             self.replSampler(i)
+             args = list()
+             for s in self.samplers:
+                 arg = s.sample()
+                 if type(arg) is list:
+                     args.extend(arg)
+                 else:
+                     args.append(arg)
+
+             slen = len(args)
+             if self.extraArgs:
+                 args.extend(self.extraArgs)
+             args.append(self)
+             args.append(i)
+             vOut = self.callback(args)
+             self.output.append(vOut)
+             self.prSamples = args[:slen]
+
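+     # Illustrative note on the callback contract used by run() above: the callback
+     # receives one flat list holding the sampled values first, then any registered
+     # extra args, then the simulator instance and the iteration count. A sketch,
+     # with a hypothetical cost function:
+     #
+     #   def cost(args):
+     #       x = args[0]    #value from the first registered sampler
+     #       y = args[1]    #value from the second registered sampler
+     #       #args[-2] is the simulator instance, args[-1] the iteration count
+     #       return x + 2.0 * y
+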
+     def getOutput(self):
+         """
+         get raw output
+         """
+         return self.output
+
+     def setOutput(self, values):
+         """
+         set raw output
+
+         Parameters
+         values : output values
+         """
+         self.output = values
+         self.numIter = len(values)
+
+     def drawHist(self, myTitle, myXlabel, myYlabel):
+         """
+         draw histogram
+
+         Parameters
+         myTitle : title
+         myXlabel : label for x
+         myYlabel : label for y
+         """
+         pyplot.hist(self.output, density=True)
+         pyplot.title(myTitle)
+         pyplot.xlabel(myXlabel)
+         pyplot.ylabel(myYlabel)
+         pyplot.show()
+
+     def getSum(self):
+         """
+         get sum
+         """
+         if not self.sum:
+             self.sum = sum(self.output)
+         return self.sum
+
+     def getMean(self):
+         """
+         get average
+         """
+         if self.mean is None:
+             self.mean = statistics.mean(self.output)
+         return self.mean
+
+     def getStdDev(self):
+         """
+         get std dev
+         """
+         if self.sd is None:
+             self.sd = statistics.stdev(self.output, xbar=self.mean) if self.mean else statistics.stdev(self.output)
+         return self.sd
+
+     def getMedian(self):
+         """
+         get median
+         """
+         med = statistics.median(self.output)
+         return med
+
+     def getMax(self):
+         """
+         get max
+         """
+         return max(self.output)
+
+     def getMin(self):
+         """
+         get min
+         """
+         return min(self.output)
+
+     def getIntegral(self, bounds):
+         """
+         integral
+
+         Parameters
+         bounds : width of the integration interval
+         """
+         if not self.sum:
+             self.sum = sum(self.output)
+         return self.sum * bounds / self.numIter
+
+     def getLowerTailStat(self, zvalue, numIntPoints=50):
+         """
+         get lower tail stat
+
+         Parameters
+         zvalue : zscore upper bound
+         numIntPoints : no of interpolation point for cum distribution
+         """
+         mean = self.getMean()
+         sd = self.getStdDev()
+         tailStart = self.getMin()
+         tailEnd = mean - zvalue * sd
+         cvaCounts = self.cumDistr(tailStart, tailEnd, numIntPoints)
+
+         reqConf = floatRange(0.0, 0.150, .01)
+         msg = "p value outside interpolation range, reduce zvalue and try again {:.5f} {:.5f}".format(reqConf[-1], cvaCounts[-1][1])
+         assert reqConf[-1] < cvaCounts[-1][1], msg
+         critValues = self.interpolateCritValues(reqConf, cvaCounts, True, tailStart, tailEnd)
+         return critValues
+
+     def getPercentile(self, cvalue):
+         """
+         percentile
+
+         Parameters
+         cvalue : value for percentile
+         """
+         count = 0
+         for v in self.output:
+             if v < cvalue:
+                 count += 1
+         percent = int(count * 100.0 / self.numIter)
+         return percent
+
+     def getCritValue(self, pvalue):
+         """
+         critical value for probability threshold
+
+         Parameters
+         pvalue : pvalue
+         """
+         assertWithinRange(pvalue, 0.0, 1.0, "invalid probability value")
+         svalues = sorted(self.output)
+         ppval = 0.0
+         cval = None
+         intv = 1.0 / self.numIter
+         for i in range(self.numIter - 1):
+             cpval = (i + 1) / self.numIter
+             if cpval > pvalue:
+                 if i == 0:
+                     cval = svalues[0]
+                 else:
+                     #linear interpolation between neighboring sorted values
+                     sl = svalues[i] - svalues[i-1]
+                     cval = svalues[i-1] + sl * (pvalue - ppval) / intv
+                 break
+             ppval = cpval
+         return cval
+
+     def getUpperTailStat(self, zvalue, numIntPoints=50):
+         """
+         upper tail stat
+
+         Parameters
+         zvalue : zscore upper bound
+         numIntPoints : no of interpolation point for cum distribution
+         """
+         mean = self.getMean()
+         sd = self.getStdDev()
+         tailStart = mean + zvalue * sd
+         tailEnd = self.getMax()
+         cvaCounts = self.cumDistr(tailStart, tailEnd, numIntPoints)
+
+         reqConf = floatRange(0.85, 1.0, .01)
+         msg = "p value outside interpolation range, reduce zvalue and try again {:.5f} {:.5f}".format(reqConf[0], cvaCounts[0][1])
+         assert reqConf[0] > cvaCounts[0][1], msg
+         critValues = self.interpolateCritValues(reqConf, cvaCounts, False, tailStart, tailEnd)
+         return critValues
+
+     def cumDistr(self, tailStart, tailEnd, numIntPoints):
+         """
+         cumulative distribution at tail
+
+         Parameters
+         tailStart : tail start
+         tailEnd : tail end
+         numIntPoints : no of interpolation points
+         """
+         delta = (tailEnd - tailStart) / numIntPoints
+         cvalues = floatRange(tailStart, tailEnd, delta)
+         cvaCounts = list()
+         for cv in cvalues:
+             count = 0
+             for v in self.output:
+                 if v < cv:
+                     count += 1
+             p = (cv, count/self.numIter)
+             if self.logger is not None:
+                 self.logger.info("{:.3f} {:.3f}".format(p[0], p[1]))
+             cvaCounts.append(p)
+         return cvaCounts
+
+     def interpolateCritValues(self, reqConf, cvaCounts, lowerTail, tailStart, tailEnd):
+         """
+         interpolate for specific confidence limits
+
+         Parameters
+         reqConf : confidence level values
+         cvaCounts : cum values
+         lowerTail : True if lower tail
+         tailStart : tail start
+         tailEnd : tail end
+         """
+         critValues = list()
+         if self.logger is not None:
+             self.logger.info("target conf limit " + str(reqConf))
+         reqConfSub = reqConf[1:] if lowerTail else reqConf[:-1]
+         for rc in reqConfSub:
+             for i in range(len(cvaCounts) - 1):
+                 if rc >= cvaCounts[i][1] and rc < cvaCounts[i+1][1]:
+                     slope = (cvaCounts[i+1][0] - cvaCounts[i][0]) / (cvaCounts[i+1][1] - cvaCounts[i][1])
+                     cval = cvaCounts[i][0] + slope * (rc - cvaCounts[i][1])
+                     p = (rc, cval)
+                     if self.logger is not None:
+                         self.logger.debug("interpolated crit values {:.3f} {:.3f}".format(p[0], p[1]))
+                     critValues.append(p)
+                     break
+         if lowerTail:
+             p = (0.0, tailStart)
+             critValues.insert(0, p)
+         else:
+             p = (1.0, tailEnd)
+             critValues.append(p)
+         return critValues
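+
+ # Usage sketch (illustrative, not part of the library): register two samplers,
+ # run the simulation with a simple callback following the contract noted in run(),
+ # then query summary statistics. All parameter values here are made up.
+ def _demoSimulation():
+     def cost(args):
+         #sampled values arrive first in the args list
+         return args[0] + 2.0 * args[1]
+
+     sim = MonteCarloSimulator(1000, cost, None, None)
+     sim.registerNormalSampler(10.0, 2.0)
+     sim.registerUniformSampler(0.0, 1.0)
+     sim.run()
+     print("mean {:.3f} sd {:.3f}".format(sim.getMean(), sim.getStdDev()))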
matumizi/mlutil.py ADDED
@@ -0,0 +1,1500 @@
+ #!/usr/local/bin/python3
+
+ # avenir-python: Machine Learning
+ # Author: Pranab Ghosh
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
+ # may not use this file except in compliance with the License. You may
+ # obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+
+ # Package imports
+ import os
+ import sys
+ import numpy as np
+ from sklearn import preprocessing
+ from sklearn import metrics
+ from sklearn.datasets import make_blobs
+ from sklearn.datasets import make_classification
+ import random
+ from math import *
+ from decimal import Decimal
+ import statistics
+ import jprops
+ from Levenshtein import distance as ld
+ from .util import *
+ from .sampler import *
+
+ class Configuration:
+     """
+     Configuration management. Supports default value, mandatory value and typed value.
+     """
+     def __init__(self, configFile, defValues, verbose=False):
+         """
+         initializer
+
+         Parameters
+         configFile : config file path
+         defValues : dictionary of default values
+         verbose : verbosity flag
+         """
+         configs = {}
+         with open(configFile) as fp:
+             for key, value in jprops.iter_properties(fp):
+                 configs[key] = value
+         self.configs = configs
+         self.defValues = defValues
+         self.verbose = verbose
+
+     def override(self, configFile):
+         """
+         override configuration from file
+
+         Parameters
+         configFile : override config file path
+         """
+         with open(configFile) as fp:
+             for key, value in jprops.iter_properties(fp):
+                 self.configs[key] = value
+
+     def setParam(self, name, value):
+         """
+         override individual configuration
+
+         Parameters
+         name : config param name
+         value : config param value
+         """
+         self.configs[name] = value
+
+     def getStringConfig(self, name):
+         """
+         get string param
+
+         Parameters
+         name : config param name
+         """
+         if self.isNone(name):
+             val = (None, False)
+         elif self.isDefault(name):
+             val = (self.handleDefault(name), True)
+         else:
+             val = (self.configs[name], False)
+         if self.verbose:
+             print("{} {} {}".format(name, self.configs[name], val[0]))
+         return val
+
+     def getIntConfig(self, name):
+         """
+         get int param
+
+         Parameters
+         name : config param name
+         """
+         if self.isNone(name):
+             val = (None, False)
+         elif self.isDefault(name):
+             val = (self.handleDefault(name), True)
+         else:
+             val = (int(self.configs[name]), False)
+         if self.verbose:
+             print("{} {} {}".format(name, self.configs[name], val[0]))
+         return val
+
+     def getFloatConfig(self, name):
+         """
+         get float param
+
+         Parameters
+         name : config param name
+         """
+         if self.isNone(name):
+             val = (None, False)
+         elif self.isDefault(name):
+             val = (self.handleDefault(name), True)
+         else:
+             val = (float(self.configs[name]), False)
+         if self.verbose:
+             print("{} {} {:06.3f}".format(name, self.configs[name], val[0]))
+         return val
+
+     def getBooleanConfig(self, name):
+         """
+         get boolean param
+
+         Parameters
+         name : config param name
+         """
+         if self.isNone(name):
+             val = (None, False)
+         elif self.isDefault(name):
+             val = (self.handleDefault(name), True)
+         else:
+             bVal = self.configs[name].lower() == "true"
+             val = (bVal, False)
+         if self.verbose:
+             print("{} {} {}".format(name, self.configs[name], val[0]))
+         return val
+
+     def getIntListConfig(self, name, delim=","):
+         """
+         get int list param
+
+         Parameters
+         name : config param name
+         delim : delimiter
+         """
+         if self.isNone(name):
+             val = (None, False)
+         elif self.isDefault(name):
+             val = (self.handleDefault(name), True)
+         else:
+             delSepStr = self.getStringConfig(name)
+
+             #specified as list or range
+             intList = strListOrRangeToIntArray(delSepStr[0])
+             val = (intList, delSepStr[1])
+         return val
+
+     def getFloatListConfig(self, name, delim=","):
+         """
+         get float list param
+
+         Parameters
+         name : config param name
+         delim : delimiter
+         """
+         delSepStr = self.getStringConfig(name)
+         if self.isNone(name):
+             val = (None, False)
+         elif self.isDefault(name):
+             val = (self.handleDefault(name), True)
+         else:
+             flList = strToFloatArray(delSepStr[0], delim)
+             val = (flList, delSepStr[1])
+         return val
+
+     def getStringListConfig(self, name, delim=","):
+         """
+         get string list param
+
+         Parameters
+         name : config param name
+         delim : delimiter
+         """
+         delSepStr = self.getStringConfig(name)
+         if self.isNone(name):
+             val = (None, False)
+         elif self.isDefault(name):
+             val = (self.handleDefault(name), True)
+         else:
+             strList = delSepStr[0].split(delim)
+             val = (strList, delSepStr[1])
+         return val
+
+     def handleDefault(self, name):
+         """
+         handles default
+
+         Parameters
+         name : config param name
+         """
+         dVal = self.defValues[name]
+         if (dVal[1] is None):
+             val = dVal[0]
+         else:
+             raise ValueError(dVal[1])
+         return val
+
+     def isNone(self, name):
+         """
+         true if value is None
+
+         Parameters
+         name : config param name
+         """
+         return self.configs[name].lower() == "none"
+
+     def isDefault(self, name):
+         """
+         true if the value is default
+
+         Parameters
+         name : config param name
+         """
+         de = self.configs[name] == "_"
+         return de
+
+     def eitherOrStringConfig(self, firstName, secondName):
+         """
+         returns one of two string parameters
+
+         Parameters
+         firstName : first parameter name
+         secondName : second parameter name
+         """
+         if not self.isNone(firstName):
+             first = self.getStringConfig(firstName)[0]
+             second = None
+             if not self.isNone(secondName):
+                 raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName)
+         else:
+             if not self.isNone(secondName):
+                 second = self.getStringConfig(secondName)[0]
+                 first = None
+             else:
+                 raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName)
+         return (first, second)
+
+     def eitherOrIntConfig(self, firstName, secondName):
+         """
+         returns one of two int parameters
+
+         Parameters
+         firstName : first parameter name
+         secondName : second parameter name
+         """
+         if not self.isNone(firstName):
+             first = self.getIntConfig(firstName)[0]
+             second = None
+             if not self.isNone(secondName):
+                 raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName)
+         else:
+             if not self.isNone(secondName):
+                 second = self.getIntConfig(secondName)[0]
+                 first = None
+             else:
+                 raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName)
+         return (first, second)
+
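+ # Usage sketch (illustrative): the properties file name and parameter names are
+ # hypothetical. Each default maps a name to (value, error-message-or-None); a
+ # value of "_" in the file falls back to the default, "none" yields None.
+ def _demoConfiguration():
+     defValues = dict()
+     defValues["train.num.iter"] = (100, None)
+     defValues["train.data.file"] = (None, "missing training data file")
+     config = Configuration("app.properties", defValues)
+     numIter = config.getIntConfig("train.num.iter")[0]
+     dataFile = config.getStringConfig("train.data.file")[0]
+     print(numIter, dataFile)
+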
+ class CatLabelGenerator:
+     """
+     label generator for categorical variables
+     """
+     def __init__(self, catValues, delim):
+         """
+         initializer
+
+         Parameters
+         catValues : dictionary of categorical values
+         delim : delimiter
+         """
+         self.encoders = {}
+         self.catValues = catValues
+         self.delim = delim
+         for k in self.catValues.keys():
+             le = preprocessing.LabelEncoder()
+             le.fit(self.catValues[k])
+             self.encoders[k] = le
+
+     def processRow(self, row):
+         """
+         encode row categorical values
+
+         Parameters
+         row : data row
+         """
+         rowArr = row.split(self.delim)
+         for i in range(len(rowArr)):
+             if (i in self.catValues):
+                 curVal = rowArr[i]
+                 assert curVal in self.catValues[i], "categorical value invalid"
+                 encVal = self.encoders[i].transform([curVal])
+                 rowArr[i] = str(encVal[0])
+         return self.delim.join(rowArr)
+
+     def getOrigLabels(self, indx):
+         """
+         get original labels
+
+         Parameters
+         indx : column index
+         """
+         return self.encoders[indx].classes_
+
+ class SupvLearningDataGenerator:
+     """
+     data generator for supervised learning
+     """
+     def __init__(self, configFile):
+         """
+         initializer
+
+         Parameters
+         configFile : config file path
+         """
+         defValues = dict()
+         defValues["common.num.samp"] = (100, None)
+         defValues["common.num.feat"] = (5, None)
+         defValues["common.feat.trans"] = (None, None)
+         defValues["common.feat.types"] = (None, "missing feature types")
+         defValues["common.cat.feat.distr"] = (None, None)
+         defValues["common.output.precision"] = (3, None)
+         defValues["common.error"] = (0.01, None)
+         defValues["class.gen.technique"] = ("blob", None)
+         defValues["class.num.feat.informative"] = (2, None)
+         defValues["class.num.feat.redundant"] = (2, None)
+         defValues["class.num.feat.repeated"] = (0, None)
+         defValues["class.num.feat.cat"] = (0, None)
+         defValues["class.num.class"] = (2, None)
+
+         self.config = Configuration(configFile, defValues)
+
+     def genClassifierData(self):
+         """
+         generates classifier data
+         """
+         nsamp = self.config.getIntConfig("common.num.samp")[0]
+         nfeat = self.config.getIntConfig("common.num.feat")[0]
+         nclass = self.config.getIntConfig("class.num.class")[0]
+         #transform with shift and scale
+         ftrans = self.config.getFloatListConfig("common.feat.trans")[0]
+         feTrans = dict()
+         for i in range(0, len(ftrans), 2):
+             tr = (ftrans[i], ftrans[i+1])
+             indx = int(i/2)
+             feTrans[indx] = tr
+
+         ftypes = self.config.getStringListConfig("common.feat.types")[0]
+
+         #categorical feature distribution
+         feCatDist = dict()
+         fcatdl = self.config.getStringListConfig("common.cat.feat.distr")[0]
+         for fcatds in fcatdl:
+             fcatd = fcatds.split(":")
+             feInd = int(fcatd[0])
+             clVal = int(fcatd[1])
+             key = (feInd, clVal)    #feature index and class value
+             dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2)))
+             feCatDist[key] = CategoricalRejectSampler(*dist)
+
+         #generate feature and class data
+         genTechnique = self.config.getStringConfig("class.gen.technique")[0]
+         error = self.config.getFloatConfig("common.error")[0]
+         if genTechnique == "blob":
+             features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)
+             for i in range(nsamp):    #shift and scale
+                 for j in range(nfeat):
+                     tr = feTrans[j]
+                     features[i,j] = (features[i,j] + tr[0]) * tr[1]
+             claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz)))
+         elif genTechnique == "classify":
+             nfeatInfo = self.config.getIntConfig("class.num.feat.informative")[0]
+             nfeatRed = self.config.getIntConfig("class.num.feat.redundant")[0]
+             nfeatRep = self.config.getIntConfig("class.num.feat.repeated")[0]
+             shifts = list(map(lambda i : feTrans[i][0], range(nfeat)))
+             scales = list(map(lambda i : feTrans[i][1], range(nfeat)))
+             features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed,
+                 n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)
+         else:
+             raise ValueError("invalid generation technique")
+
+         #add categorical features and format
+         nCatFeat = self.config.getIntConfig("class.num.feat.cat")[0]
+         prec = self.config.getIntConfig("common.output.precision")[0]
+         for f, c in zip(features, claz):
+             nfs = list(map(lambda i : self.numFeToStr(f[i], ftypes[i], prec), range(nfeat)))
+             if nCatFeat > 0:
+                 cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1)))
+                 rec = ",".join(nfs) + "," + ",".join(cfs) + "," + str(c)
+             else:
+                 rec = ",".join(nfs) + "," + str(c)
+             yield rec
+
+     def numFeToStr(self, fv, ft, prec):
+         """
+         numeric feature value to string
+
+         Parameters
+         fv : field value
+         ft : field data type
+         prec : precision
+         """
+         if ft == "float":
+             s = formatFloat(prec, fv)
+         elif ft == "int":
+             s = str(int(fv))
+         else:
+             raise ValueError("invalid type, expecting float or int")
+         return s
+
+     def catFe(self, i, cv, ft, feCatDist):
+         """
+         generate categorical feature
+
+         Parameters
+         i : col index
+         cv : class value
+         ft : field data type
+         feCatDist : cat value distribution
+         """
+         if ft == "cat":
+             key = (i, cv)
+             s = feCatDist[key].sample()
+         else:
+             raise ValueError("invalid type, expecting categorical")
+         return s
+
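+ # Usage sketch (illustrative): the config file name is hypothetical and must define
+ # the common.* and class.* parameters consumed above; genClassifierData() is a
+ # generator yielding one delimited record per sample.
+ def _demoClassifierGen():
+     gen = SupvLearningDataGenerator("class_gen.properties")
+     for rec in gen.genClassifierData():
+         print(rec)
+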
+ class RegressionDataGenerator:
+     """
+     data generator for regression, including square terms, cross terms, bias, noise, correlated variables
+     and user defined function
+     """
+     def __init__(self, configFile, callback=None):
+         """
+         initializer
+
+         Parameters
+         configFile : config file path
+         callback : user defined function
+         """
+         defValues = dict()
+         defValues["common.pvar.samplers"] = (None, None)
+         defValues["common.pvar.ranges"] = (None, None)
+         defValues["common.linear.weights"] = (None, None)
+         defValues["common.square.weights"] = (None, None)
+         defValues["common.crterm.weights"] = (None, None)
+         defValues["common.corr.params"] = (None, None)
+         defValues["common.bias"] = (0, None)
+         defValues["common.noise"] = (None, None)
+         defValues["common.tvar.range"] = (None, None)
+         defValues["common.weight.niter"] = (20, None)
+         self.config = Configuration(configFile, defValues)
+         self.callback = callback
+
+         #samplers for predictor variables
+         items = self.config.getStringListConfig("common.pvar.samplers")[0]
+         self.samplers = list(map(lambda s : createSampler(s), items))
+         self.npvar = len(self.samplers)
+
+         #value ranges for predictor variables
+         items = self.config.getStringListConfig("common.pvar.ranges")[0]
+         self.pvranges = list()
+         for i in range(0, len(items), 2):
+             if items[i] == "none":
+                 r = None
+             else:
+                 vmin = float(items[i])
+                 vmax = float(items[i+1])
+                 r = (vmin, vmax, vmax-vmin)
+             self.pvranges.append(r)
+         assertEqual(len(self.pvranges), self.npvar, "no of predictor var ranges provided is invalid")
+
+         #linear weights for predictor variables
+         self.lweights = self.config.getFloatListConfig("common.linear.weights")[0]
+         assertEqual(len(self.lweights), self.npvar, "no of linear weights provided is invalid")
+
+         #square weights for predictor variables
+         items = self.config.getStringListConfig("common.square.weights")[0]
+         self.sqweight = dict()
+         for i in range(0, len(items), 2):
+             vi = int(items[i])
+             assertLesser(vi, self.npvar, "invalid predictor var index")
+             wt = float(items[i+1])
+             self.sqweight[vi] = wt
+
+         #cross term weights for predictor variables
+         items = self.config.getStringListConfig("common.crterm.weights")[0]
+         self.crweight = dict()
+         for i in range(0, len(items), 3):
+             vi = int(items[i])
+             assertLesser(vi, self.npvar, "invalid predictor var index")
+             vj = int(items[i+1])
+             assertLesser(vj, self.npvar, "invalid predictor var index")
+             wt = float(items[i+2])
+             vp = (vi, vj)
+             self.crweight[vp] = wt
+
+         #correlated variables
+         items = self.config.getStringListConfig("common.corr.params")[0]
+         self.corrparams = dict()
+         for co in items:
+             cparam = co.split(":")
+             vi = int(cparam[0])
+             vj = int(cparam[1])
+             k = (vi,vj)
+             bias = float(cparam[2])
+             wt = float(cparam[3])
+             noise = float(cparam[4])
+             roundoff = cparam[5] == "true"
+             v = (bias, wt, noise, roundoff)
+             self.corrparams[k] = v
+
+         #bias, noise and target range values
+         self.bias = self.config.getFloatConfig("common.bias")[0]
+         noise = self.config.getStringListConfig("common.noise")[0]
+         self.ndistr = noise[0]
+         self.noise = float(noise[1])
+         self.tvarlim = self.config.getFloatListConfig("common.tvar.range")[0]
+
+         #sample
+         niter = self.config.getIntConfig("common.weight.niter")[0]
+         yvals = list()
+         for i in range(niter):
+             y = self.sample()[1]
+             yvals.append(y)
+
+         #scale weights by sampled mean and target range midpoint
+         my = statistics.mean(yvals)
+         myt = (self.tvarlim[0] + self.tvarlim[1]) / 2
+         sc = (myt - self.bias) / (my - self.bias)
+         self.lweights = list(map(lambda w : w * sc, self.lweights))
+
+         for k in self.sqweight.keys():
+             self.sqweight[k] *= sc
+
+         for k in self.crweight.keys():
+             self.crweight[k] *= sc
+
+     def sample(self):
+         """
+         sample predictor variables and target variable
+         """
+         pvd = list(map(lambda s : s.sample(), self.samplers))
+
+         #correct for correlated variables
+         for k in self.corrparams.keys():
+             vi = k[0]
+             vj = k[1]
+             v = self.corrparams[k]
+             bias = v[0]
+             wt = v[1]
+             noise = v[2]
+             roundoff = v[3]
+             nv = bias + wt * pvd[vi]
+             pvd[vj] = preturbScalar(nv, noise, "normal")
+             if roundoff:
+                 pvd[vj] = round(pvd[vj])
+
+         spvd = list()
+         lsum = self.bias
+         for i in range(self.npvar):
+             #range limit
+             if self.pvranges[i] is not None:
+                 pvd[i] = rangeLimit(pvd[i], self.pvranges[i][0], self.pvranges[i][1])
+             spvd.append(pvd[i])
+
+             #scale
+             if self.pvranges[i] is not None:
+                 pvd[i] = scaleMinMaxScaData(pvd[i], self.pvranges[i])
+             lsum += self.lweights[i] * pvd[i]
+
+         #square terms
+         ssum = 0
+         for k in self.sqweight.keys():
+             ssum += self.sqweight[k] * pvd[k] * pvd[k]
+
+         #cross terms
+         crsum = 0
+         for k in self.crweight.keys():
+             vi = k[0]
+             vj = k[1]
+             crsum += self.crweight[k] * pvd[vi] * pvd[vj]
+
+         y = lsum + ssum + crsum
+         y = preturbScalar(y, self.noise, self.ndistr)
+         if self.callback is not None:
+             ufy = self.callback(spvd)
+             y += ufy
+         r = (spvd, y)
+         return r
+
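+ # Usage sketch (illustrative): the config file name is hypothetical and must supply
+ # the common.* parameters consumed by the initializer; each sample() call returns
+ # a (predictor value list, target value) pair.
+ def _demoRegressionGen():
+     gen = RegressionDataGenerator("regr_gen.properties")
+     for _ in range(5):
+         pvd, y = gen.sample()
+         print(pvd, y)
+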
+ def loadDataFile(file, delim, cols, colIndices):
+     """
+     loads delim separated file and extracts columns
+
+     Parameters
+     file : file path
+     delim : delimiter
+     cols : columns to use from file
+     colIndices : columns to extract
+     """
+     data = np.loadtxt(file, delimiter=delim, usecols=cols)
+     extrData = data[:,colIndices]
+     return (data, extrData)
+
+ def loadFeatDataFile(file, delim, cols):
+     """
+     loads delim separated file and extracts columns
+
+     Parameters
+     file : file path
+     delim : delimiter
+     cols : columns to use from file
+     """
+     data = np.loadtxt(file, delimiter=delim, usecols=cols)
+     return data
+
+ def extrColumns(arr, columns):
+     """
+     extracts columns
+
+     Parameters
+     arr : 2D array
+     columns : columns
+     """
+     return arr[:, columns]
+
+ def subSample(featData, clsData, subSampleRate, withReplacement):
+     """
+     subsamples feature and class label data
+
+     Parameters
+     featData : 2D array of feature data
+     clsData : array of class labels
+     subSampleRate : fraction to be sampled
+     withReplacement : true if sampling with replacement
+     """
+     sampSize = int(featData.shape[0] * subSampleRate)
+     sampledIndx = np.random.choice(featData.shape[0], sampSize, replace=withReplacement)
+     sampFeat = featData[sampledIndx]
+     sampCls = clsData[sampledIndx]
+     return (sampFeat, sampCls)
+
+ def euclideanDistance(x, y):
+     """
+     euclidean distance
+
+     Parameters
+     x : first vector
+     y : second vector
+     """
+     return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))
+
+ def squareRooted(x):
+     """
+     square root of sum of squares
+
+     Parameters
+     x : data vector
+     """
+     return round(sqrt(sum([a*a for a in x])), 3)
+
+ def cosineSimilarity(x, y):
+     """
+     cosine similarity
+
+     Parameters
+     x : first vector
+     y : second vector
+     """
+     numerator = sum(a*b for a,b in zip(x,y))
+     denominator = squareRooted(x) * squareRooted(y)
+     return round(numerator / float(denominator), 3)
+
+ def cosineDistance(x, y):
+     """
+     cosine distance
+
+     Parameters
+     x : first vector
+     y : second vector
+     """
+     return 1.0 - cosineSimilarity(x,y)
+
+ def manhattanDistance(x, y):
+     """
+     manhattan distance
+
+     Parameters
+     x : first vector
+     y : second vector
+     """
+     return sum(abs(a-b) for a,b in zip(x,y))
+
+ def nthRoot(value, nRoot):
+     """
+     nth root
+
+     Parameters
+     value : data value
+     nRoot : root
+     """
+     rootValue = 1/float(nRoot)
+     return round(Decimal(value) ** Decimal(rootValue), 3)
+
+ def minkowskiDistance(x, y, pValue):
+     """
+     minkowski distance
+
+     Parameters
+     x : first vector
+     y : second vector
+     pValue : power factor
+     """
+     return nthRoot(sum(pow(abs(a-b), pValue) for a,b in zip(x, y)), pValue)
+
+ def jaccardSimilarityX(x, y):
+     """
+     jaccard similarity
+
+     Parameters
+     x : first vector
+     y : second vector
+     """
+     intersectionCardinality = len(set.intersection(*[set(x), set(y)]))
+     unionCardinality = len(set.union(*[set(x), set(y)]))
+     return intersectionCardinality/float(unionCardinality)
+
+ def jaccardSimilarity(x, y, wx=1.0, wy=1.0):
+     """
+     weighted jaccard similarity
+
+     Parameters
+     x : first vector
+     y : second vector
+     wx : weight for x
+     wy : weight for y
+     """
+     sx = set(x)
+     sy = set(y)
+     sxyInt = sx.intersection(sy)
+     intCardinality = len(sxyInt)
+     sxIntDiff = sx.difference(sxyInt)
+     syIntDiff = sy.difference(sxyInt)
+     unionCardinality = len(sx.union(sy))
+     return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff))
+
+ def levenshteinSimilarity(s1, s2):
+     """
+     Levenshtein similarity for strings
+
+     Parameters
+     s1 : first string
+     s2 : second string
+     """
+     assert type(s1) == str and type(s2) == str, "Levenshtein similarity is for strings only"
+     d = ld(s1,s2)
+     l = max(len(s1),len(s2))
+     d = 1.0 - min(d/l, 1.0)
+     return d
+
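+ # Quick demonstration of the distance and similarity helpers above on two small
+ # vectors; purely illustrative. With pValue 2, minkowskiDistance should agree
+ # with euclideanDistance up to rounding.
+ def _demoDistances():
+     x = [1.0, 2.0, 3.0]
+     y = [2.0, 4.0, 6.0]
+     print("euclidean {:.3f}".format(euclideanDistance(x, y)))
+     print("manhattan {:.3f}".format(manhattanDistance(x, y)))
+     print("cosine distance {:.3f}".format(cosineDistance(x, y)))
+     print("minkowski p=2 {}".format(minkowskiDistance(x, y, 2)))
+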
+ def norm(values, po=2):
+     """
+     normalizes a vector to unit norm
+
+     Parameters
+     values : list of values
+     po : power
+     """
+     no = sum(list(map(lambda v: pow(v,po), values)))
+     no = pow(no, 1.0/po)
+     return list(map(lambda v: v/no, values))
+
+ def createOneHotVec(size, indx = -1):
+     """
+     random one hot vector
+
+     Parameters
+     size : vector size
+     indx : one hot position
+     """
+     vec = [0] * size
+     s = random.randint(0, size - 1) if indx < 0 else indx
+     vec[s] = 1
+     return vec
+
+ def createAllOneHotVec(size):
+     """
+     create all one hot vectors
+
+     Parameters
+     size : vector size and no of vectors
+     """
+     vecs = list()
+     for i in range(size):
+         vec = [0] * size
+         vec[i] = 1
+         vecs.append(vec)
+     return vecs
+
+ def blockShuffle(data, blockSize):
+     """
+     block shuffle
+
+     Parameters
+     data : list data
+     blockSize : block size
+     """
+     numBlock = int(len(data) / blockSize)
+     remain = len(data) % blockSize
+     numBlock += (1 if remain > 0 else 0)
+     shuffled = list()
+     for i in range(numBlock):
+         b = random.randint(0, numBlock-1)
+         beg = b * blockSize
+         if (b < numBlock-1):
+             end = beg + blockSize
+             shuffled.extend(data[beg:end])
+         else:
+             shuffled.extend(data[beg:])
+     return shuffled
+
+ def shuffle(data, numShuffle):
+     """
+     shuffle data by random swapping
+
+     Parameters
+     data : list data
+     numShuffle : no of pairwise swaps
+     """
+     sz = len(data)
+     if numShuffle is None:
+         numShuffle = int(sz / 2)
+     for i in range(numShuffle):
+         fi = random.randint(0, sz - 1)
+         se = random.randint(0, sz - 1)
+         tmp = data[fi]
+         data[fi] = data[se]
+         data[se] = tmp
+
+ def randomWalk(size, start, lowStep, highStep):
+     """
+     random walk
+
+     Parameters
+     size : no of steps
+     start : initial position
+     lowStep : step min
+     highStep : step max
+     """
+     cur = start
+     for i in range(size):
+         yield cur
+         cur += randomFloat(lowStep, highStep)
+
+ def binaryEcodeCategorical(values, value):
+     """
+     one hot binary encoding
+
+     Parameters
+     values : list of values
+     value : value to be replaced with 1
+     """
+     size = len(values)
+     vec = [0] * size
+     for i in range(size):
+         if (values[i] == value):
+             vec[i] = 1
+     return vec
+
+ def createLabeledSeq(inputData, tw):
+     """
+     creates feature, label pairs from sequence data, where tw features are followed by the output
+
+     Parameters
+     inputData : list containing features and labels
+     tw : no of features
+     """
+     features = list()
+     labels = list()
+     l = len(inputData)
+     for i in range(l - tw):
+         trainSeq = inputData[i:i+tw]
+         trainLabel = inputData[i+tw]
+         features.append(trainSeq)
+         labels.append(trainLabel)
+     return (features, labels)
+
+ def createLabeledSeqFromFile(filePath, delim, index, tw):
+     """
+     creates feature, label pairs from 1D sequence data in a file
+
+     Parameters
+     filePath : file path
+     delim : delimiter
+     index : column index
+     tw : no of features
+     """
+     seqData = getFileColumnAsFloat(filePath, delim, index)
+     return createLabeledSeq(seqData, tw)
+
+ def fromMultDimSeqToTabular(data, inpSize, seqLen):
+     """
+     reshapes input of shape (nrow, inpSize * seqLen) to shape (nrow * seqLen, inpSize)
+
+     Parameters
+     data : 2D array
+     inpSize : each input size in sequence
+     seqLen : sequence length
+     """
+     nrow = data.shape[0]
+     assert data.shape[1] == inpSize * seqLen, "invalid input size or sequence length"
+     return data.reshape(nrow * seqLen, inpSize)
+
+ def fromTabularToMultDimSeq(data, inpSize, seqLen):
+     """
+     reshapes input of shape (nrow * seqLen, inpSize) to shape (nrow, inpSize * seqLen)
+
+     Parameters
+     data : 2D array
+     inpSize : each input size in sequence
+     seqLen : sequence length
+     """
+     nrow = int(data.shape[0] / seqLen)
+     assert data.shape[1] == inpSize, "invalid input size"
+     return data.reshape(nrow, seqLen * inpSize)
+
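+ # Round trip demonstration of the two reshaping helpers above: a (2, 6) array
+ # holding two 3 step sequences with 2 inputs per step flattens to (6, 2) and back.
+ def _demoSeqReshape():
+     data = np.arange(12).reshape(2, 6)
+     tab = fromMultDimSeqToTabular(data, 2, 3)
+     print(tab.shape)    #(6, 2)
+     seq = fromTabularToMultDimSeq(tab, 2, 3)
+     print(seq.shape)    #(2, 6)
+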
+ def difference(data, interval=1):
+     """
+     takes difference in time series data
+
+     Parameters
+     data : list data
+     interval : interval for difference
+     """
+     diff = list()
+     for i in range(interval, len(data)):
+         value = data[i] - data[i - interval]
+         diff.append(value)
+     return diff
+
+ def normalizeMatrix(data, norm, axis=1):
+     """
+     normalizes each row of the matrix
+
+     Parameters
+     data : 2D data
+     norm : normalization method
+     axis : row or column
+     """
+     normalized = preprocessing.normalize(data, norm=norm, axis=axis)
+     return normalized
+
+ def standardizeMatrix(data, axis=0):
+     """
+     standardizes each column of the matrix with mean and std deviation
+
+     Parameters
+     data : 2D data
+     axis : row or column
+     """
+     standardized = preprocessing.scale(data, axis=axis)
+     return standardized
+
+ def asNumpyArray(data):
+     """
+     converts to numpy array
+
+     Parameters
+     data : array
+     """
+     return np.array(data)
+
+ def perfMetric(metric, yActual, yPred, clabels=None):
+     """
+     predictive model accuracy metric
+
+     Parameters
+     metric : accuracy metric
+     yActual : actual values array
+     yPred : predicted values array
+     clabels : class labels
+     """
+     if metric == "rsquare":
+         score = metrics.r2_score(yActual, yPred)
+     elif metric == "mae":
+         score = metrics.mean_absolute_error(yActual, yPred)
+     elif metric == "mse":
+         score = metrics.mean_squared_error(yActual, yPred)
+     elif metric == "acc":
+         yPred = np.rint(yPred)
+         score = metrics.accuracy_score(yActual, yPred)
+     elif metric == "mlAcc":
+         yPred = np.argmax(yPred, axis=1)
+         score = metrics.accuracy_score(yActual, yPred)
+     elif metric == "prec":
+         yPred = np.argmax(yPred, axis=1)
+         score = metrics.precision_score(yActual, yPred)
+     elif metric == "rec":
+         yPred = np.argmax(yPred, axis=1)
+         score = metrics.recall_score(yActual, yPred)
+     elif metric == "fone":
+         yPred = np.argmax(yPred, axis=1)
+         score = metrics.f1_score(yActual, yPred)
+     elif metric == "confm":
+         yPred = np.argmax(yPred, axis=1)
+         score = metrics.confusion_matrix(yActual, yPred)
+     elif metric == "clarep":
+         yPred = np.argmax(yPred, axis=1)
+         score = metrics.classification_report(yActual, yPred)
+     elif metric == "bce":
+         if clabels is None:
+             clabels = [0, 1]
+         score = metrics.log_loss(yActual, yPred, labels=clabels)
+     elif metric == "ce":
+         assert clabels is not None, "labels must be provided"
+         score = metrics.log_loss(yActual, yPred, labels=clabels)
+     else:
+         exitWithMsg("invalid prediction performance metric " + metric)
+     return score
+
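+ # Small demonstration of perfMetric above for two regression style metrics;
+ # the values are made up for illustration.
+ def _demoPerfMetric():
+     yActual = [1.0, 2.0, 3.0, 4.0]
+     yPred = [1.1, 1.9, 3.2, 3.8]
+     print("rsquare {:.3f}".format(perfMetric("rsquare", yActual, yPred)))
+     print("mae {:.3f}".format(perfMetric("mae", yActual, yPred)))
+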
+ def scaleData(data, method):
+     """
+     scales feature data column wise
+
+     Parameters
+     data : 2D array
+     method : scaling method
+     """
+     if method == "minmax":
+         scaler = preprocessing.MinMaxScaler()
+         data = scaler.fit_transform(data)
+     elif method == "zscale":
+         data = preprocessing.scale(data)
+     else:
+         raise ValueError("invalid scaling method")
+     return data
+
+ def scaleDataWithParams(data, method, scParams):
+     """
+     scales feature data column wise
+
+     Parameters
+     data : 2D array
+     method : scaling method
+     scParams : scaling parameters
+     """
+     if method == "minmax":
+         data = scaleMinMaxTabData(data, scParams)
+     elif method == "zscale":
+         raise ValueError("zscale method not supported with parameters")
+     else:
+         raise ValueError("invalid scaling method")
+     return data
+
+ def scaleMinMaxScaData(data, minMax):
+     """
+     min max scales scalar data
+
+     Parameters
+     data : scalar data
+     minMax : min, max and range
+     """
+     sd = (data - minMax[0]) / minMax[2]
+     return sd
+
+ def scaleMinMaxTabData(tdata, minMax):
+     """
+     scales tabular feature data column wise using min max values for each field
+
+     Parameters
+     tdata : 2D array
+     minMax : min, max and range for each column
+     """
+     stdata = list()
+     for r in tdata:
+         srdata = list()
+         for i, c in enumerate(r):
+             sd = (c - minMax[i][0]) / minMax[i][2]
+             srdata.append(sd)
+         stdata.append(srdata)
+     return stdata
+
+ def scaleMinMax(rdata, minMax):
+     """
+     scales feature data column wise using min max values for each field
+
+     Parameters
+     rdata : data array
+     minMax : min, max and range for each column
+     """
+     srdata = list()
+     for i in range(len(rdata)):
+         d = rdata[i]
+         sd = (d - minMax[i][0]) / minMax[i][2]
+         srdata.append(sd)
+     return srdata
+
+ def harmonicNum(n):
+     """
+     harmonic number
+
+     Parameters
+     n : number
+     """
+     h = 0
+     for i in range(1, n+1, 1):
+         h += 1.0 / i
+     return h
+
+ def digammaFun(n):
+     """
+     digamma function
+
+     Parameters
+     n : number
+     """
+     #Euler Mascheroni constant
+     ec = 0.577216
+     return harmonicNum(n - 1) - ec
+
+ def getDataPartitions(tdata, types, columns = None):
+     """
+     partitions data with the given columns and random split points defined with predicates
+
+     Parameters
+     tdata : 2D array
+     types : data types
+     columns : column indexes
+     """
+     (dtypes, cvalues) = extractTypesFromString(types)
+     if columns is None:
+         ncol = len(tdata[0])
+         columns = list(range(ncol))
+     ncol = len(columns)
+
+     #partition predicates
+     partitions = None
+     for c in columns:
+         dtype = dtypes[c]
+         pred = list()
+         if dtype == "int" or dtype == "float":
+             (vmin, vmax) = getColMinMax(tdata, c)
+             r = vmax - vmin
+             rmin = vmin + .2 * r
+             rmax = vmax - .2 * r
+             sp = randomFloat(rmin, rmax)
+             if dtype == "int":
+                 sp = int(sp)
+             else:
+                 sp = "{:.3f}".format(sp)
+                 sp = float(sp)
+             pred.append([c, "LT", sp])
+             pred.append([c, "GE", sp])
+         elif dtype == "cat":
+             cv = cvalues[c]
+             card = len(cv)
+             if card < 3:
+                 num = 1
+             else:
+                 num = randomInt(1, card - 1)
+             sp = selectRandomSubListFromList(cv, num)
+             sp = " ".join(sp)
+             pred.append([c, "IN", sp])
+             pred.append([c, "NOTIN", sp])
+
+         if partitions is None:
+             partitions = pred.copy()
+         else:
+             #extend each existing partition with both new predicates
+             tparts = list()
+             for p in partitions:
+                 l1 = p.copy()
+                 l1.extend(pred[0])
+                 l2 = p.copy()
+                 l2.extend(pred[1])
+                 tparts.append(l1)
+                 tparts.append(l2)
+             partitions = tparts
+     return partitions
+
+ def genAlmostUniformDistr(size, nswap=50):
+     """
+     generates an almost uniform probability distribution
+
+     Parameters
+     size : distr size
+     nswap : no of mass swaps
+     """
+     un = 1.0 / size
+     distr = [un] * size
+     distr = mutDistr(distr, 0.1 * un, nswap)
+     return distr
+
+ def mutDistr(distr, shift, nswap=50):
+     """
+     mutates a probability distribution
+
+     Parameters
+     distr : distribution
+     shift : amount of shift for swap
+     nswap : no of mass swaps
+     """
+     size = len(distr)
+     for _ in range(nswap):
+         fi = randomInt(0, size - 1)
+         si = randomInt(0, size - 1)
+         while fi == si:
+             fi = randomInt(0, size - 1)
+             si = randomInt(0, size - 1)
+
+         shift = randomFloat(0, shift)
+         t = distr[fi]
+         distr[fi] -= shift
+         if (distr[fi] < 0):
+             distr[fi] = 0.0
+             shift = t
+         distr[si] += shift
+     return distr
+
+ def generateBinDistribution(size, ntrue):
+     """
+     generates binary array with some elements set to 1
+
+     Parameters
+     size : distr size
+     ntrue : no of true values
+     """
+     distr = [0] * size
+     idxs = selectRandomSubListFromList(list(range(size)), ntrue)
+     for i in idxs:
+         distr[i] = 1
+     return distr
+
+ def mutBinaryDistr(distr, nmut):
+     """
+     mutates binary distribution
+
+     Parameters
+     distr : distr
+     nmut : no of mutations
+     """
+     idxs = selectRandomSubListFromList(list(range(len(distr))), nmut)
+     for i in idxs:
+         distr[i] = distr[i] ^ 1
+     return distr
+
1305
+ def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=","):
1306
+ """
1307
+ file record generator that superimposes given data in the specified segment of a column
1308
+
1309
+ Parameters
1310
+ filePath : file path
1311
+ column : column index
1312
+ offset : offset into column values
1313
+ seqLen : length of subseq
1314
+ modifier : data to be superimposed either list or a sampler object
1315
+ precision : floating point precision
1316
+ delim : field delimiter
1317
+ """
1318
+ beg = offset
1319
+ end = beg + seqLen
1320
+ isList = type(modifier) == list
1321
+ i = 0
1322
+ for rec in fileRecGen(filePath, delim):
1323
+ if i >= beg and i < end:
1324
+ va = float(rec[column])
1325
+ if isList:
1326
+ va += modifier[i - beg]
1327
+ else:
1328
+ va += modifier.sample()
1329
+ rec[column] = formatFloat(precision, va)
1330
+ yield delim.join(rec)
1331
+ i += 1
1332
+
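A usage sketch with a hypothetical file "data.csv"; NormalSampler (from the sampler module below) superimposes zero mean gaussian noise on column 2 of records 100 through 149:

    noise = NormalSampler(0, 2.0)
    for line in fileSelFieldSubSeqModifierGen("data.csv", 2, 100, 50, noise, 3):
        print(line)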
1333
+ class ShiftedDataGenerator:
1334
+ """
1335
+ transforms data for distribution shift
1336
+ """
1337
+ def __init__(self, types, tdata, addFact, multFact):
1338
+ """
1339
+ initializer
1340
+
1341
+ Parameters
1342
+ types : data types
1343
+ tdata : 2D array
1344
+ addFact : factor for data shift
1345
+ multFact : factor for data scaling
1346
+ """
1347
+ (self.dtypes, self.cvalues) = extractTypesFromString(types)
1348
+
1349
+ self.limits = dict()
1350
+ for k,v in self.dtypes.items():
1351
+ if v == "int" or v == "false":
1352
+ (vmin, vmax) = getColMinMax(tdata, k)
1353
+ self.limits[k] = vmax - vmin
1354
+ self.addMin = - addFact / 2
1355
+ self.addMax = addFact / 2
1356
+ self.multMin = 1.0 - multFact / 2
1357
+ self.multMax = 1.0 + multFact / 2
1358
+
1359
+
1360
+
1361
+
1362
+ def transform(self, tdata):
1363
+ """
1364
+ linearly transforms data to create distribution shift with random shift and scale
1365
+
1366
+ Parameters
1367
+ tdata : 2D array
1368
+ """
1369
+ transforms = dict()
1370
+ for k,v in self.dtypes.items():
1371
+ if v == "int" or v == "false":
1372
+ shift = randomFloat(self.addMin, self.addMax) * self.limits[k]
1373
+ scale = randomFloat(self.multMin, self.multMax)
1374
+ trns = (shift, scale)
1375
+ transforms[k] = trns
1376
+ elif v == "cat":
1377
+ transforms[k] = isEventSampled(50)
1378
+
1379
+ ttdata = list()
1380
+ for rec in tdata:
1381
+ nrec = rec.copy()
1382
+ for c in range(len(rec)):
1383
+ if c in self.dtypes:
1384
+ dtype = self.dtypes[c]
1385
+ if dtype == "int" or dtype == "float":
1386
+ (shift, scale) = transforms[c]
1387
+ nval = shift + rec[c] * scale
1388
+ if dtype == "int":
1389
+ nrec[c] = int(nval)
1390
+ else:
1391
+ nrec[c] = nval
1392
+ elif dtype == "cat":
1393
+ cv = self.cvalues[c]
1394
+ if transforms[c]:
1395
+ nval = selectOtherRandomFromList(cv, rec[c])
1396
+ nrec[c] = nval
1397
+
1398
+ ttdata.append(nrec)
1399
+
1400
+ return ttdata
1401
+
1402
+ def transformSpecified(self, tdata, sshift, scale):
1403
+ """
1404
+ linearly transforms data to create distribution shift with specified shift and scale
1405
+
1406
+ Parameters
1407
+ tdata : 2D array
1408
+ sshift : shift factor
1409
+ scale : scale factor
1410
+ """
1411
+ transforms = dict()
1412
+ for k,v in self.dtypes.items():
1413
+ if v == "int" or v == "false":
1414
+ shift = sshift * self.limits[k]
1415
+ trns = (shift, scale)
1416
+ transforms[k] = trns
1417
+ elif v == "cat":
1418
+ transforms[k] = isEventSampled(50)
1419
+
1420
+ ttdata = self.__scaleShift(tdata, transforms)
1421
+ return ttdata
1422
+
1423
+ def __scaleShift(self, tdata, transforms):
1424
+ """
1425
+ shifts and scales tabular data
1426
+
1427
+ Parameters
1428
+ tdata : 2D array
1429
+ transforms : transforms to apply
1430
+ """
1431
+ ttdata = list()
1432
+ for rec in tdata:
1433
+ nrec = rec.copy()
1434
+ for c in range(len(rec)):
1435
+ if c in self.dtypes:
1436
+ dtype = self.dtypes[c]
1437
+ if dtype == "int" or dtype == "float":
1438
+ (shift, scale) = transforms[c]
1439
+ nval = shift + rec[c] * scale
1440
+ if dtype == "int":
1441
+ nrec[c] = int(nval)
1442
+ else:
1443
+ nrec[c] = nval
1444
+ elif dtype == "cat":
1445
+ cv = self.cvalues[c]
1446
+ if transforms[c]:
1447
+ #nval = selectOtherRandomFromList(cv, rec[c])
1448
+ #nrec[c] = nval
1449
+ pass
1450
+
1451
+ ttdata.append(nrec)
1452
+ return ttdata
1453
+
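A usage sketch (the types descriptor is the one accepted by extractTypesFromString; the factor values are arbitrary here): with addFact 0.2 and multFact 0.1, numeric columns get a random shift within +/- 0.1 of the column range and a random scale within 1 +/- 0.05:

    gen = ShiftedDataGenerator(types, tdata, 0.2, 0.1)
    shifted = gen.transform(tdata)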
1454
+ class RollingStat(object):
1455
+ """
1456
+ stats for rolling window
1457
+ """
1458
+ def __init__(self, wsize):
1459
+ """
1460
+ initializer
1461
+
1462
+ Parameters
1463
+ wsize : window size
1464
+ """
1465
+ self.window = list()
1466
+ self.wsize = wsize
1467
+ self.mean = None
1468
+ self.sd = None
1469
+
1470
+ def add(self, value):
1471
+ """
1472
+ add a value
1473
+
1474
+ Parameters
1475
+ value : value to add
1476
+ """
1477
+ self.window.append(value)
1478
+ if len(self.window) > self.wsize:
1479
+ self.window = self.window[1:]
1480
+
1481
+ def getStat(self):
1482
+ """
1483
+ get rolling window mean and std deviation
1484
+ """
1485
+ assertGreater(len(self.window), 0, "window is empty")
1486
+ if len(self.window) == 1:
1487
+ self.mean = self.window[0]
1488
+ self.sd = 0
1489
+ else:
1490
+ self.mean = statistics.mean(self.window)
1491
+ self.sd = statistics.stdev(self.window, xbar=self.mean)
1492
+ re = (self.mean, self.sd)
1493
+ return re
1494
+
1495
+ def getSize(self):
1496
+ """
1497
+ return window size
1498
+ """
1499
+ return len(self.window)
1500
+
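A minimal usage sketch (`values` is any numeric sequence):

    rstat = RollingStat(20)
    for v in values:
        rstat.add(v)
    # mean and std deviation over the last (at most) 20 values
    (mean, sd) = rstat.getStat()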
matumizi/sampler.py ADDED
@@ -0,0 +1,1455 @@
1
+ #!/usr/local/bin/python3
2
+
3
+ # avenir-python: Machine Learning
4
+ # Author: Pranab Ghosh
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You may
8
+ # obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+
18
+ import sys
19
+ import random
20
+ import time
21
+ import math
22
+ import random
23
+ import numpy as np
24
+ from scipy import stats
25
+ from random import randint
26
+ from .util import *
27
+ from .stats import Histogram
28
+
29
+ def randomFloat(low, high):
30
+ """
31
+ sample float within range
32
+
33
+ Parameters
34
+ low : low value
35
+ high : high value
36
+ """
37
+ return random.random() * (high-low) + low
38
+
39
+ def randomInt(minv, maxv):
40
+ """
41
+ sample int within range
42
+
43
+ Parameters
44
+ minv : low value
45
+ maxv : high value
46
+ """
47
+ return randint(minv, maxv)
48
+
49
+ def randIndex(lData):
50
+ """
51
+ random index of a list
52
+
53
+ Parameters
54
+ lData : list data
55
+ """
56
+ return randint(0, len(lData)-1)
57
+
58
+ def randomUniformSampled(low, high):
59
+ """
60
+ sample float within range
61
+
62
+ Parameters
63
+ low : low value
64
+ high : high value
65
+ """
66
+ return np.random.uniform(low, high)
67
+
68
+ def randomUniformSampledList(low, high, size):
69
+ """
70
+ sample floats within range to create list
71
+
72
+ Parameters
73
+ low : low value
74
+ high : high value
75
+ size : size of list to be returned
76
+ """
77
+ return np.random.uniform(low, high, size)
78
+
79
+ def randomNormSampled(mean, sd):
80
+ """
81
+ sample float from normal
82
+
83
+ Parameters
84
+ mean : mean
85
+ sd : std deviation
86
+ """
87
+ return np.random.normal(mean, sd)
88
+
89
+ def randomNormSampledList(mean, sd, size):
90
+ """
91
+ sample float list from normal
92
+
93
+ Parameters
94
+ mean : mean
95
+ sd : std deviation
96
+ size : size of list to be returned
97
+ """
98
+ return np.random.normal(mean, sd, size)
99
+
100
+ def randomSampledList(sampler, size):
101
+ """
102
+ sample list from given sampler
103
+
104
+ Parameters
105
+ sampler : sampler object
106
+ size : size of list to be returned
107
+ """
108
+ return list(map(lambda i : sampler.sample(), range(size)))
109
+
110
+
111
+ def minLimit(val, minv):
112
+ """
113
+ min limit
114
+
115
+ Parameters
116
+ val : value
117
+ minv : min limit
118
+ """
119
+ if (val < minv):
120
+ val = minv
121
+ return val
122
+
123
+
124
+ def rangeLimit(val, minv, maxv):
125
+ """
126
+ range limit
127
+
128
+ Parameters
129
+ val : value
130
+ minv : min limit
131
+ maxv : max limit
132
+ """
133
+ if (val < minv):
134
+ val = minv
135
+ elif (val > maxv):
136
+ val = maxv
137
+ return val
138
+
139
+
140
+ def sampleUniform(minv, maxv):
141
+ """
142
+ sample int within range
143
+
144
+ Parameters
145
+ minv : int min limit
146
+ maxv : int max limit
147
+ """
148
+ return randint(minv, maxv)
149
+
150
+
151
+ def sampleFromBase(value, dev):
152
+ """
153
+ sample int wrt base
154
+
155
+ Parameters
156
+ value : base value
157
+ dev : deviation
158
+ """
159
+ return randint(value - dev, value + dev)
160
+
161
+
162
+ def sampleFloatFromBase(value, dev):
163
+ """
164
+ sample float wrt base
165
+
166
+ Parameters
167
+ value : base value
168
+ dev : deviation
169
+ """
170
+ return randomFloat(value - dev, value + dev)
171
+
172
+
173
+ def distrUniformWithRanndom(total, numItems, noiseLevel):
174
+ """
175
+ distributes a total uniformly across bins with some added randomness, preserving the total
176
+
177
+ Parameters
178
+ total : total count
179
+ numItems : no of bins
180
+ noiseLevel : noise level fraction
181
+ """
182
+ perItem = total / numItems
183
+ var = perItem * noiseLevel
184
+ items = []
185
+ for i in range(numItems):
186
+ item = perItem + randomFloat(-var, var)
187
+ items.append(item)
188
+
189
+ #adjust last item
190
+ sm = sum(items[:-1])
191
+ items[-1] = total - sm
192
+ return items
193
+
194
+
195
+ def isEventSampled(threshold, maxv=100):
196
+ """
197
+ sample event which occurs if sampled below threshold
198
+
199
+ Parameters
200
+ threshold : threshold for sampling
201
+ maxv : maximum value
202
+ """
203
+ return randint(0, maxv) < threshold
204
+
205
+
206
+ def sampleBinaryEvents(events, probPercent):
207
+ """
208
+ sample binary events
209
+
210
+ Parameters
211
+ events : two events
212
+ probPercent : probability as percentage
213
+ """
214
+ if (randint(0, 100) < probPercent):
215
+ event = events[0]
216
+ else:
217
+ event = events[1]
218
+ return event
219
+
220
+
221
+ def addNoiseNum(value, sampler):
222
+ """
223
+ add noise to numeric value
224
+
225
+ Parameters
226
+ value : base value
227
+ sampler : sampler for noise
228
+ """
229
+ return value * (1 + sampler.sample())
230
+
231
+
232
+ def addNoiseCat(value, values, noise):
233
+ """
234
+ add noise to categorical value i.e with some probability change value
235
+
236
+ Parameters
237
+ value : cat value
238
+ values : cat values
239
+ noise : noise level fraction
240
+ """
241
+ newValue = value
242
+ threshold = int(noise * 100)
243
+ if (isEventSampled(threshold)):
244
+ newValue = selectRandomFromList(values)
245
+ while newValue == value:
246
+ newValue = selectRandomFromList(values)
247
+ return newValue
248
+
249
+
250
+ def sampleWithReplace(data, sampSize):
251
+ """
252
+ sample with replacement
253
+
254
+ Parameters
255
+ data : array
256
+ sampSize : sample size
257
+ """
258
+ sampled = list()
259
+ le = len(data)
260
+ if sampSize is None:
261
+ sampSize = le
262
+ for i in range(sampSize):
263
+ j = random.randint(0, le - 1)
264
+ sampled.append(data[j])
265
+ return sampled
266
+
267
+ class CumDistr:
268
+ """
269
+ cumulative distr
270
+ """
271
+
272
+ def __init__(self, data, numBins = None):
273
+ """
274
+ initializer
275
+
276
+ Parameters
277
+ data : array
278
+ numBins : no of bins
279
+ """
280
+ if not numBins:
281
+ numBins = int(len(data) / 5)
282
+ res = stats.cumfreq(data, numbins=numBins)
283
+ self.cdistr = res.cumcount / len(data)
284
+ self.loLim = res.lowerlimit
285
+ self.upLim = res.lowerlimit + res.binsize * res.cumcount.size
286
+ self.binWidth = res.binsize
287
+
288
+ def getDistr(self, value):
289
+ """
290
+ get cumulative distribution
291
+
292
+ Parameters
293
+ value : value
294
+ """
295
+ if value <= self.loLim:
296
+ d = 0.0
297
+ elif value >= self.upLim:
298
+ d = 1.0
299
+ else:
300
+ bin = int((value - self.loLim) / self.binWidth)
301
+ d = self.cdistr[bin]
302
+ return d
303
+
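A usage sketch: build an empirical CDF from normal samples; getDistr returns the cumulative probability P(X <= value), so the query below should come out near 0.84:

    data = randomNormSampledList(100.0, 10.0, 1000)
    cd = CumDistr(data)
    print(cd.getDistr(110.0))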
304
+ class BernoulliTrialSampler:
305
+ """
306
+ bernoulli trial sampler return True or False
307
+ """
308
+
309
+ def __init__(self, pr, events=None):
310
+ """
311
+ initializer
312
+
313
+ Parameters
314
+ pr : probability
315
+ events : event values
316
+ """
317
+ self.pr = pr
318
+ self.retEvent = False if events is None else True
319
+ self.events = events
320
+
321
+
322
+ def sample(self):
323
+ """
324
+ samples value
325
+ """
326
+ res = random.random() < self.pr
327
+ if self.retEvent:
328
+ res = self.events[0] if res else self.events[1]
329
+ return res
330
+
331
+ class PoissonSampler:
332
+ """
333
+ poisson sampler returns number of events
334
+ """
335
+ def __init__(self, rateOccur, maxSamp):
336
+ """
337
+ initializer
338
+
339
+ Parameters
340
+ rateOccur : rate of occurrence
341
+ maxSamp : max limit on no of samples
342
+ """
343
+ self.rateOccur = rateOccur
344
+ self.maxSamp = int(maxSamp)
345
+ self.pmax = self.calculatePr(rateOccur)
346
+
347
+ def calculatePr(self, numOccur):
348
+ """
349
+ calculates probability
350
+
351
+ Parameters
352
+ numOccur : no of occurrences
353
+ """
354
+ p = (self.rateOccur ** numOccur) * math.exp(-self.rateOccur) / math.factorial(numOccur)
355
+ return p
356
+
357
+ def sample(self):
358
+ """
359
+ samples value
360
+ """
361
+ done = False
362
+ samp = 0
363
+ while not done:
364
+ no = randint(0, self.maxSamp)
365
+ sp = randomFloat(0.0, self.pmax)
366
+ ap = self.calculatePr(no)
367
+ if sp < ap:
368
+ done = True
369
+ samp = no
370
+ return samp
371
+
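A quick check of the rejection based Poisson sampler: the sample mean should approach the occurrence rate (an integer rate is used here so math.factorial is happy):

    ps = PoissonSampler(4, 20)
    counts = [ps.sample() for _ in range(1000)]
    print(sum(counts) / len(counts))    # close to 4.0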
372
+ class ExponentialSampler:
373
+ """
374
+ returns interval between events
375
+ """
376
+ def __init__(self, rateOccur, maxSamp = None):
377
+ """
378
+ initializer
379
+
380
+ Parameters
381
+ rateOccur : rate of occurrence
382
+ maxSamp : max limit on interval
383
+ """
384
+ self.interval = 1.0 / rateOccur
385
+ self.maxSamp = int(maxSamp) if maxSamp is not None else None
386
+
387
+ def sample(self):
388
+ """
389
+ samples value
390
+ """
391
+ sampled = np.random.exponential(scale=self.interval)
392
+ if self.maxSamp is not None:
393
+ while sampled > self.maxSamp:
394
+ sampled = np.random.exponential(scale=self.interval)
395
+ return sampled
396
+
397
+ class UniformNumericSampler:
398
+ """
399
+ uniform sampler for numerical values
400
+ """
401
+ def __init__(self, minv, maxv):
402
+ """
403
+ initializer
404
+
405
+ Parameters
406
+ minv : min value
407
+ maxv : max value
408
+ """
409
+ self.minv = minv
410
+ self.maxv = maxv
411
+
412
+ def isNumeric(self):
413
+ """
414
+ returns true
415
+ """
416
+ return True
417
+
418
+ def sample(self):
419
+ """
420
+ samples value
421
+ """
422
+ samp = sampleUniform(self.minv, self.maxv) if isinstance(self.minv, int) else randomFloat(self.minv, self.maxv)
423
+ return samp
424
+
425
+ class UniformCategoricalSampler:
426
+ """
427
+ uniform sampler for categorical values
428
+ """
429
+ def __init__(self, cvalues):
430
+ """
431
+ initializer
432
+
433
+ Parameters
434
+ cvalues : categorical value list
435
+ """
436
+ self.cvalues = cvalues
437
+
438
+ def isNumeric(self):
439
+ return False
440
+
441
+ def sample(self):
442
+ """
443
+ samples value
444
+ """
445
+ return selectRandomFromList(self.cvalues)
446
+
447
+ class NormalSampler:
448
+ """
449
+ normal sampler
450
+ """
451
+ def __init__(self, mean, stdDev):
452
+ """
453
+ initializer
454
+
455
+ Parameters
456
+ mean : mean
457
+ stdDev : std deviation
458
+ """
459
+ self.mean = mean
460
+ self.stdDev = stdDev
461
+ self.sampleAsInt = False
462
+
463
+ def isNumeric(self):
464
+ return True
465
+
466
+ def sampleAsIntValue(self):
467
+ """
468
+ set True to sample as int
469
+ """
470
+ self.sampleAsInt = True
471
+
472
+ def sample(self):
473
+ """
474
+ samples value
475
+ """
476
+ samp = np.random.normal(self.mean, self.stdDev)
477
+ if self.sampleAsInt:
478
+ samp = int(samp)
479
+ return samp
480
+
481
+ class LogNormalSampler:
482
+ """
483
+ log normal sampler
484
+ """
485
+ def __init__(self, mean, stdDev):
486
+ """
487
+ initializer
488
+
489
+ Parameters
490
+ mean : mean
491
+ stdDev : std deviation
492
+ """
493
+ self.mean = mean
494
+ self.stdDev = stdDev
495
+
496
+ def isNumeric(self):
497
+ return True
498
+
499
+ def sample(self):
500
+ """
501
+ samples value
502
+ """
503
+ return np.random.lognormal(self.mean, self.stdDev)
504
+
505
+ class NormalSamplerWithTrendCycle:
506
+ """
507
+ normal sampler with cycle and trend
508
+ """
509
+ def __init__(self, mean, stdDev, dmean, cycle, step=1):
510
+ """
511
+ initializer
512
+
513
+ Parameters
514
+ mean : mean
515
+ stdDev : std deviation
516
+ dmean : trend delta
517
+ cycle : cycle values wrt base mean
518
+ step : adjustment step for cycle and trend
519
+ """
520
+ self.mean = mean
521
+ self.cmean = mean
522
+ self.stdDev = stdDev
523
+ self.dmean = dmean
524
+ self.cycle = cycle
525
+ self.clen = len(cycle) if cycle is not None else 0
526
+ self.step = step
527
+ self.count = 0
528
+
529
+ def isNumeric(self):
530
+ return True
531
+
532
+ def sample(self):
533
+ """
534
+ samples value
535
+ """
536
+ s = np.random.normal(self.cmean, self.stdDev)
537
+ self.count += 1
538
+ if self.count % self.step == 0:
539
+ cy = 0
540
+ if self.clen > 1:
541
+ coff = self.count % self.clen
542
+ cy = self.cycle[coff]
543
+ tr = self.count * self.dmean
544
+ self.cmean = self.mean + tr + cy
545
+ return s
546
+
547
+
548
+ class ParetoSampler:
549
+ """
550
+ pareto sampler
551
+ """
552
+ def __init__(self, mode, shape):
553
+ """
554
+ initializer
555
+
556
+ Parameters
557
+ mode : mode
558
+ shape : shape
559
+ """
560
+ self.mode = mode
561
+ self.shape = shape
562
+
563
+ def isNumeric(self):
564
+ return True
565
+
566
+ def sample(self):
567
+ """
568
+ samples value
569
+ """
570
+ return (np.random.pareto(self.shape) + 1) * self.mode
571
+
572
+ class GammaSampler:
573
+ """
574
+ gamma sampler
575
+ """
576
+ def __init__(self, shape, scale):
577
+ """
578
+ initializer
579
+
580
+ Parameters
581
+ shape : shape
582
+ scale : scale
583
+ """
584
+ self.shape = shape
585
+ self.scale = scale
586
+
587
+ def isNumeric(self):
588
+ return True
589
+
590
+ def sample(self):
591
+ """
592
+ samples value
593
+ """
594
+ return np.random.gamma(self.shape, self.scale)
595
+
596
+ class GaussianRejectSampler:
597
+ """
598
+ gaussian sampling based on rejection sampling
599
+ """
600
+ def __init__(self, mean, stdDev):
601
+ """
602
+ initializer
603
+
604
+ Parameters
605
+ mean : mean
606
+ stdDev : std deviation
607
+ """
608
+ self.mean = mean
609
+ self.stdDev = stdDev
610
+ self.xmin = mean - 3 * stdDev
611
+ self.xmax = mean + 3 * stdDev
612
+ self.ymin = 0.0
613
+ self.fmax = 1.0 / (math.sqrt(2.0 * math.pi) * stdDev)
614
+ self.ymax = 1.05 * self.fmax
615
+ self.sampleAsInt = False
616
+
617
+ def isNumeric(self):
618
+ return True
619
+
620
+ def sampleAsIntValue(self):
621
+ """
622
+ sample as int value
623
+ """
624
+ self.sampleAsInt = True
625
+
626
+ def sample(self):
627
+ """
628
+ samples value
629
+ """
630
+ done = False
631
+ samp = 0
632
+ while not done:
633
+ x = randomFloat(self.xmin, self.xmax)
634
+ y = randomFloat(self.ymin, self.ymax)
635
+ f = self.fmax * math.exp(-(x - self.mean) * (x - self.mean) / (2.0 * self.stdDev * self.stdDev))
636
+ if (y < f):
637
+ done = True
638
+ samp = x
639
+ if self.sampleAsInt:
640
+ samp = int(samp)
641
+ return samp
642
+
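Note that the uniform envelope spans only +/- 3 std deviations, so tails beyond 3 sigma are never sampled; a short sketch:

    gs = GaussianRejectSampler(50.0, 10.0)
    samples = [gs.sample() for _ in range(1000)]    # all within [20, 80]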
643
+ class DiscreteRejectSampler:
644
+ """
645
+ non parametric sampling for discrete values using given distribution based
646
+ on rejection sampling
647
+ """
648
+ def __init__(self, xmin, xmax, step, *values):
649
+ """
650
+ initializer
651
+
652
+ Parameters
653
+ xmin : min value
654
+ xmax : max value
655
+ step : discrete step
656
+ values : distr values
657
+ """
658
+ self.xmin = xmin
659
+ self.xmax = xmax
660
+ self.step = step
661
+ self.distr = values
662
+ if (len(self.distr) == 1):
663
+ self.distr = self.distr[0]
664
+ numSteps = int((self.xmax - self.xmin) / self.step)
665
+ #print("{:.3f} {:.3f} {:.3f} {}".format(self.xmin, self.xmax, self.step, numSteps))
666
+ assert len(self.distr) == numSteps + 1, "invalid number of distr values expected {}".format(numSteps + 1)
667
+ self.ximin = 0
668
+ self.ximax = numSteps
669
+ self.pmax = float(max(self.distr))
670
+
671
+ def isNumeric(self):
672
+ return True
673
+
674
+ def sample(self):
675
+ """
676
+ samples value
677
+ """
678
+ done = False
679
+ samp = None
680
+ while not done:
681
+ xi = randint(self.ximin, self.ximax)
682
+ #print(formatAny(xi, "xi"))
683
+ ps = randomFloat(0.0, self.pmax)
684
+ pa = self.distr[xi]
685
+ if ps < pa:
686
+ samp = self.xmin + xi * self.step
687
+ done = True
688
+ return samp
689
+
690
+
691
+ class TriangularRejectSampler:
692
+ """
693
+ non parametric sampling using triangular distribution based on rejection sampling
694
+ """
695
+ def __init__(self, xmin, xmax, vertexValue, vertexPos=None):
696
+ """
697
+ initializer
698
+
699
+ Parameters
700
+ xmin : min value
701
+ xmax : max value
702
+ vertexValue : distr value at vertex
703
+ vertexPos : vertex position
704
+ """
705
+ self.xmin = xmin
706
+ self.xmax = xmax
707
+ self.vertexValue = vertexValue
708
+ if vertexPos:
709
+ assert vertexPos > xmin and vertexPos < xmax, "vertex position outside bound"
710
+ self.vertexPos = vertexPos
711
+ else:
712
+ self.vertexPos = 0.5 * (xmin + xmax)
713
+ self.s1 = vertexValue / (self.vertexPos - xmin)
714
+ self.s2 = vertexValue / (xmax - self.vertexPos)
715
+
716
+ def isNumeric(self):
717
+ return True
718
+
719
+ def sample(self):
720
+ """
721
+ samples value
722
+ """
723
+ done = False
724
+ samp = None
725
+ while not done:
726
+ x = randomFloat(self.xmin, self.xmax)
727
+ y = randomFloat(0.0, self.vertexValue)
728
+ f = (x - self.xmin) * self.s1 if x < self.vertexPos else (self.xmax - x) * self.s2
729
+ if (y < f):
730
+ done = True
731
+ samp = x
732
+
733
+ return samp;
734
+
735
+ class NonParamRejectSampler:
736
+ """
737
+ non parametric sampling using given distribution based on rejection sampling
738
+ """
739
+ def __init__(self, xmin, binWidth, *values):
740
+ """
741
+ initializer
742
+
743
+ Parameters
744
+ xmin : min value
745
+ binWidth : bin width
746
+ values : distr values
747
+ """
748
+ self.values = values
749
+ if (len(self.values) == 1):
750
+ self.values = self.values[0]
751
+ self.xmin = xmin
752
+ self.xmax = xmin + binWidth * (len(self.values) - 1)
753
+ #print(self.xmin, self.xmax, binWidth)
754
+ self.binWidth = binWidth
755
+ self.fmax = 0
756
+ for v in self.values:
757
+ if (v > self.fmax):
758
+ self.fmax = v
759
+ self.ymin = 0
760
+ self.ymax = self.fmax
761
+ self.sampleAsInt = True
762
+
763
+ def isNumeric(self):
764
+ return True
765
+
766
+ def sampleAsFloat(self):
767
+ self.sampleAsInt = False
768
+
769
+ def sample(self):
770
+ """
771
+ samples value
772
+ """
773
+ done = False
774
+ samp = 0
775
+ while not done:
776
+ if self.sampleAsInt:
777
+ x = random.randint(self.xmin, self.xmax)
778
+ y = random.randint(self.ymin, self.ymax)
779
+ else:
780
+ x = randomFloat(self.xmin, self.xmax)
781
+ y = randomFloat(self.ymin, self.ymax)
782
+ bin = int((x - self.xmin) / self.binWidth)
783
+ f = self.values[bin]
784
+ if (y < f):
785
+ done = True
786
+ samp = x
787
+ return samp
788
+
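A usage sketch: a peaked histogram over x = 10..16 with unit bin width; a single list argument works because the initializer unwraps it:

    nps = NonParamRejectSampler(10, 1, [1, 2, 4, 8, 4, 2, 1])
    nps.sampleAsFloat()
    s = nps.sample()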
789
+ class JointNonParamRejectSampler:
790
+ """
791
+ non parametric sampling using given distribution based on rejection sampling
792
+ """
793
+ def __init__(self, xmin, xbinWidth, xnbin, ymin, ybinWidth, ynbin, *values):
794
+ """
795
+ initializer
796
+
797
+ Parameters
798
+ xmin : min value for x
799
+ xbinWidth : bin width for x
800
+ xnbin : no of bins for x
801
+ ymin : min value for y
802
+ ybinWidth : bin width for y
803
+ ynbin : no of bins for y
804
+ values : distr values
805
+ """
806
+ self.values = values
807
+ if (len(self.values) == 1):
808
+ self.values = self.values[0]
809
+ assert len(self.values) == xnbin * ynbin, "wrong number of values for joint distr"
810
+ self.xmin = xmin
811
+ self.xmax = xmin + xbinWidth * xnbin
812
+ self.xbinWidth = xbinWidth
813
+ self.ymin = ymin
814
+ self.ymax = ymin + ybinWidth * ynbin
815
+ self.ybinWidth = ybinWidth
816
+ self.pmax = max(self.values)
817
+ self.values = np.array(self.values).reshape(xnbin, ynbin)
818
+
819
+ def isNumeric(self):
820
+ return True
821
+
822
+ def sample(self):
823
+ """
824
+ samples value
825
+ """
826
+ done = False
827
+ samp = 0
828
+ while not done:
829
+ x = randomFloat(self.xmin, self.xmax)
830
+ y = randomFloat(self.ymin, self.ymax)
831
+ xbin = int((x - self.xmin) / self.xbinWidth)
832
+ ybin = int((y - self.ymin) / self.ybinWidth)
833
+ ap = self.values[xbin][ybin]
834
+ sp = randomFloat(0.0, self.pmax)
835
+ if (sp < ap):
836
+ done = True
837
+ samp = [x,y]
838
+ return samp
839
+
840
+
841
+ class JointNormalSampler:
842
+ """
843
+ joint normal sampler
844
+ """
845
+ def __init__(self, *values):
846
+ """
847
+ initializer
848
+
849
+ Parameters
850
+ values : 2 mean values followed by 4 values for covar matrix
851
+ """
852
+ lvalues = list(values)
853
+ assert len(lvalues) == 6, "incorrect number of arguments for joint normal sampler"
854
+ mean = lvalues[:2]
855
+ self.mean = np.array(mean)
856
+ sd = lvalues[2:]
857
+ self.sd = np.array(sd).reshape(2,2)
858
+
859
+ def isNumeric(self):
860
+ return True
861
+
862
+ def sample(self):
863
+ """
864
+ samples value
865
+ """
866
+ return list(np.random.multivariate_normal(self.mean, self.sd))
867
+
868
+
869
+ class MultiVarNormalSampler:
870
+ """
871
+ multivariate normal sampler
872
+ """
873
+ def __init__(self, numVar, *values):
874
+ """
875
+ initializer
876
+
877
+ Parameters
878
+ numVar : no of variables
879
+ values : numVar mean values followed by numVar x numVar values for covar matrix
880
+ """
881
+ lvalues = list(values)
882
+ assert len(lvalues) == numVar + numVar * numVar, "incorrect number of arguments for multi var normal sampler"
883
+ mean = lvalues[:numVar]
884
+ self.mean = np.array(mean)
885
+ sd = lvalues[numVar:]
886
+ self.sd = np.array(sd).reshape(numVar,numVar)
887
+
888
+ def isNumeric(self):
889
+ return True
890
+
891
+ def sample(self):
892
+ """
893
+ samples value
894
+ """
895
+ return list(np.random.multivariate_normal(self.mean, self.sd))
896
+
897
+ class CategoricalRejectSampler:
898
+ """
899
+ non parametric sampling for categorical attributes using given distribution based
900
+ on rejection sampling
901
+ """
902
+ def __init__(self, *values):
903
+ """
904
+ initializer
905
+
906
+ Parameters
907
+ values : list of tuples which contains a categorical value and the corresponding distr value
908
+ """
909
+ self.distr = values
910
+ if (len(self.distr) == 1):
911
+ self.distr = self.distr[0]
912
+ maxv = 0
913
+ for t in self.distr:
914
+ if t[1] > maxv:
915
+ maxv = t[1]
916
+ self.maxv = maxv
917
+
918
+ def sample(self):
919
+ """
920
+ samples value
921
+ """
922
+ done = False
923
+ samp = ""
924
+ while not done:
925
+ t = self.distr[randint(0, len(self.distr)-1)]
926
+ d = randomFloat(0, self.maxv)
927
+ if (d <= t[1]):
928
+ done = True
929
+ samp = t[0]
930
+ return samp
931
+
932
+
933
+ class CategoricalSetSampler:
934
+ """
935
+ non parametric sampler for categorical attributes that uniformly samples
936
+ a set of distinct values from the full list of values
937
+ """
938
+ def __init__(self, *values):
939
+ """
940
+ initializer
941
+
942
+ Parameters
943
+ values : list which contains a categorical values
944
+ """
945
+ self.values = values
946
+ if (len(self.values) == 1):
947
+ self.values = self.values[0]
948
+ self.sampled = list()
949
+
950
+ def sample(self):
951
+ """
952
+ samples a value only from previously unsampled values
953
+ """
954
+ samp = selectRandomFromList(self.values)
955
+ while True:
956
+ if samp in self.sampled:
957
+ samp = selectRandomFromList(self.values)
958
+ else:
959
+ self.sampled.append(samp)
960
+ break
961
+ return samp
962
+
963
+ def setSampled(self, sampled):
964
+ """
965
+ set already sampled
966
+
967
+ Parameters
968
+ sampled : already sampled list
969
+ """
970
+ self.sampled = sampled
971
+
972
+ def unsample(self, sample=None):
973
+ """
974
+ remove from sample history
975
+
976
+ Parameters
977
+ sample : sample to be removed
978
+ """
979
+ if sample is None:
980
+ self.sampled.clear()
981
+ else:
982
+ self.sampled.remove(sample)
983
+
984
+ class DistrMixtureSampler:
985
+ """
986
+ distr mixture sampler
987
+ """
988
+ def __init__(self, mixtureWtDistr, *compDistr):
989
+ """
990
+ initializer
991
+
992
+ Parameters
993
+ mixtureWtDistr : sampler that returns index into sampler list
994
+ compDistr : sampler list
995
+ """
996
+ self.mixtureWtDistr = mixtureWtDistr
997
+ self.compDistr = compDistr
998
+ if (len(self.compDistr) == 1):
999
+ self.compDistr = self.compDistr[0]
1000
+
1001
+ def isNumeric(self):
1002
+ return True
1003
+
1004
+ def sample(self):
1005
+ """
1006
+ samples value
1007
+ """
1008
+ comp = self.mixtureWtDistr.sample()
1009
+
1010
+ #sample sampled comp distr
1011
+ return self.compDistr[comp].sample()
1012
+
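A sketch of a two component gaussian mixture; the weight sampler must return an index into the component list, here 0 with probability about 0.6 and 1 with about 0.4:

    wt = DiscreteRejectSampler(0, 1, 1, [60, 40])
    mix = DistrMixtureSampler(wt, NormalSampler(10, 1), NormalSampler(25, 2))
    s = mix.sample()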
1013
+ class AncestralSampler:
1014
+ """
1015
+ ancestral sampler using conditional distribution
1016
+ """
1017
+ def __init__(self, parentDistr, childDistr, numChildren):
1018
+ """
1019
+ initializer
1020
+
1021
+ Parameters
1022
+ parentDistr : parent distr
1023
+ childDistr : childdren distribution dictionary
1024
+ numChildren : no of children
1025
+ """
1026
+ self.parentDistr = parentDistr
1027
+ self.childDistr = childDistr
1028
+ self.numChildren = numChildren
1029
+
1030
+ def sample(self):
1031
+ """
1032
+ samples value
1033
+ """
1034
+ parent = self.parentDistr.sample()
1035
+
1036
+ #sample all children conditioned on parent
1037
+ children = []
1038
+ for i in range(self.numChildren):
1039
+ key = (parent, i)
1040
+ child = self.childDistr[key].sample()
1041
+ children.append(child)
1042
+ return (parent, children)
1043
+
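A sketch with one child per parent; the child distribution dictionary is keyed by (parent value, child index):

    parent = CategoricalRejectSampler(("a", 60), ("b", 40))
    children = {("a", 0): NormalSampler(10, 1), ("b", 0): NormalSampler(20, 2)}
    asamp = AncestralSampler(parent, children, 1)
    (p, ch) = asamp.sample()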
1044
+ class ClusterSampler:
1045
+ """
1046
+ sample cluster and then sample member of sampled cluster
1047
+ """
1048
+ def __init__(self, clusters, *clustDistr):
1049
+ """
1050
+ initializer
1051
+
1052
+ Parameters
1053
+ clusters : dictionary clusters
1054
+ clustDistr : distr for clusters
1055
+ """
1056
+ self.sampler = CategoricalRejectSampler(*clustDistr)
1057
+ self.clusters = clusters
1058
+
1059
+ def sample(self):
1060
+ """
1061
+ samples value
1062
+ """
1063
+ cluster = self.sampler.sample()
1064
+ member = random.choice(self.clusters[cluster])
1065
+ return (cluster, member)
1066
+
1067
+
1068
+ class MetropolitanSampler:
1069
+ """
1070
+ Metropolis (MCMC) sampler
1071
+ """
1072
+ def __init__(self, propStdDev, min, binWidth, values):
1073
+ """
1074
+ initializer
1075
+
1076
+ Parameters
1077
+ propStdDev : proposal distr std dev
1078
+ min : min domain value for target distr
1079
+ binWidth : bin width
1080
+ values : target distr values
1081
+ """
1082
+ self.targetDistr = Histogram.createInitialized(min, binWidth, values)
1083
+ self.propsalDistr = GaussianRejectSampler(0, propStdDev)
1084
+ self.proposalMixture = False
+ self.globalProposalDistr = None
1085
+
1086
+ # bootstrap sample
1087
+ (minv, maxv) = self.targetDistr.getMinMax()
1088
+ self.curSample = random.randint(minv, maxv)
1089
+ self.curDistr = self.targetDistr.value(self.curSample)
1090
+ self.transCount = 0
1091
+
1092
+ def initialize(self):
1093
+ """
1094
+ initialize
1095
+ """
1096
+ (minv, maxv) = self.targetDistr.getMinMax()
1097
+ self.curSample = random.randint(minv, maxv)
1098
+ self.curDistr = self.targetDistr.value(self.curSample)
1099
+ self.transCount = 0
1100
+
1101
+ def setProposalDistr(self, propsalDistr):
1102
+ """
1103
+ set custom proposal distribution
1104
+
1105
+ Parameters
1106
+ propsalDistr : proposal distribution
1107
+ """
1108
+ self.propsalDistr = propsalDistr
1109
+
1110
+
1111
+ def setGlobalProposalDistr(self, globPropStdDev, proposalChoiceThreshold):
1112
+ """
1113
+ set custom proposal distribution
1114
+
1115
+ Parameters
1116
+ globPropStdDev : global proposal distr std deviation
1117
+ proposalChoiceThreshold : threshold for using global proposal distribution
1118
+ """
1119
+ self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)
1120
+ self.proposalChoiceThreshold = proposalChoiceThreshold
1121
+ self.proposalMixture = True
1122
+
1123
+ def sample(self):
1124
+ """
1125
+ samples value
1126
+ """
1127
+ nextSample = self.proposalSample(1)
1128
+ self.targetSample(nextSample)
1129
+ return self.curSample;
1130
+
1131
+ def proposalSample(self, skip):
1132
+ """
1133
+ sample from proposal distribution
1134
+
1135
+ Parameters
1136
+ skip : no of samples to skip
1137
+ """
1138
+ for i in range(skip):
1139
+ if not self.proposalMixture:
1140
+ #one proposal distr
1141
+ nextSample = self.curSample + self.propsalDistr.sample()
1142
+ nextSample = self.targetDistr.boundedValue(nextSample)
1143
+ else:
1144
+ #mixture of proposal distr
1145
+ if random.random() < self.proposalChoiceThreshold:
1146
+ nextSample = self.curSample + self.propsalDistr.sample()
1147
+ else:
1148
+ nextSample = self.curSample + self.globalProposalDistr.sample()
1149
+ nextSample = self.targetDistr.boundedValue(nextSample)
1150
+
1151
+ return nextSample
1152
+
1153
+ def targetSample(self, nextSample):
1154
+ """
1155
+ target sample
1156
+
1157
+ Parameters
1158
+ nextSample : proposal distr sample
1159
+ """
1160
+ nextDistr = self.targetDistr.value(nextSample)
1161
+
1162
+ transition = False
1163
+ if nextDistr > self.curDistr:
1164
+ transition = True
1165
+ else:
1166
+ distrRatio = float(nextDistr) / self.curDistr
1167
+ if random.random() < distrRatio:
1168
+ transition = True
1169
+
1170
+ if transition:
1171
+ self.curSample = nextSample
1172
+ self.curDistr = nextDistr
1173
+ self.transCount += 1
1174
+
1175
+
1176
+ def subSample(self, skip):
1177
+ """
1178
+ sub sample
1179
+
1180
+ Parameters
1181
+ skip : no of samples to skip
1182
+ """
1183
+ nextSample = self.proposalSample(skip)
1184
+ self.targetSample(nextSample)
1185
+ return self.curSample;
1186
+
1187
+ def setMixtureProposal(self, globPropStdDev, mixtureThreshold):
1188
+ """
1189
+ mixture proposal
1190
+
1191
+ Parameters
1192
+ globPropStdDev : global proposal distr std deviation
1193
+ mixtureThreshold : threshold for using global proposal distribution
1194
+ """
1195
+ self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)
1196
+ self.mixtureThreshold = mixtureThreshold
1197
+
1198
+ def samplePropsal(self):
1199
+ """
1200
+ sample from proposal distr
1201
+
1202
+ """
1203
+ if self.globalProposalDistr is None:
1204
+ proposal = self.propsalDistr.sample()
1205
+ else:
1206
+ if random.random() < self.mixtureThreshold:
1207
+ proposal = self.propsalDistr.sample()
1208
+ else:
1209
+ proposal = self.globalProposalDistr.sample()
1210
+
1211
+ return proposal
1212
+
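A sketch of Metropolis sampling from a histogram shaped target; subSample skips intermediate proposals to reduce autocorrelation between draws:

    ms = MetropolitanSampler(2.0, 0, 1, [1, 3, 8, 12, 8, 3, 1])
    draws = [ms.subSample(5) for _ in range(1000)]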
1213
+ class PermutationSampler:
1214
+ """
1215
+ permutation sampler by shuffling a list
1216
+ """
1217
+ def __init__(self):
1218
+ """
1219
+ initialize
1220
+ """
1221
+ self.values = None
1222
+ self.numShuffles = None
1223
+
1224
+ @staticmethod
1225
+ def createSamplerWithValues(values, *numShuffles):
1226
+ """
1227
+ creator with values
1228
+
1229
+ Parameters
1230
+ values : list data
1231
+ numShuffles : no of shuffles or range of no of shuffles
1232
+ """
1233
+ sampler = PermutationSampler()
1234
+ sampler.values = values
1235
+ sampler.numShuffles = numShuffles
1236
+ return sampler
1237
+
1238
+ @staticmethod
1239
+ def createSamplerWithRange(minv, maxv, *numShuffles):
1240
+ """
1241
+ creator with range min and max
1242
+
1243
+ Parameters
1244
+ minv : min of range
1245
+ maxv : max of range
1246
+ numShuffles : no of shuffles or range of no of shuffles
1247
+ """
1248
+ sampler = PermutationSampler()
1249
+ sampler.values = list(range(minv, maxv + 1))
1250
+ sampler.numShuffles = numShuffles
1251
+ return sampler
1252
+
1253
+ def sample(self):
1254
+ """
1255
+ sample new permutation
1256
+ """
1257
+ cloned = self.values.copy()
1258
+ shuffle(cloned, *self.numShuffles)
1259
+ return cloned
1260
+
1261
+ class SpikeyDataSampler:
1262
+ """
1263
+ samples spikey data
1264
+ """
1265
+ def __init__(self, intvMean, intvScale, distr, spikeValueMean, spikeValueStd, spikeMaxDuration, baseValue = 0):
1266
+ """
1267
+ initializer
1268
+
1269
+ Parameters
1270
+ intvMean : interval mean
1271
+ intvScale : interval std dev
1272
+ distr : type of distr for interval
1273
+ spikeValueMean : spike value mean
1274
+ spikeValueStd : spike value std dev
1275
+ spikeMaxDuration : max duration for spike
1276
+ baseValue : base or offset value
1277
+ """
1278
+ if distr == "norm":
1279
+ self.intvSampler = NormalSampler(intvMean, intvScale)
1280
+ elif distr == "expo":
1281
+ rate = 1.0 / intvScale
1282
+ self.intvSampler = ExponentialSampler(rate)
1283
+ else:
1284
+ raise ValueError("invalid distribution")
1285
+
1286
+ self.spikeSampler = NormalSampler(spikeValueMean, spikeValueStd)
1287
+ self.spikeMaxDuration = spikeMaxDuration
1288
+ self.baseValue = baseValue
1289
+ self.inSpike = False
1290
+ self.spikeCount = 0
1291
+ self.baseCount = 0
1292
+ self.baseLength = int(self.intvSampler.sample())
1293
+ self.spikeValues = list()
1294
+ self.spikeLength = None
1295
+
1296
+ def sample(self):
1297
+ """
1298
+ sample new value
1299
+ """
1300
+ if self.baseCount <= self.baseLength:
1301
+ sampled = self.baseValue
1302
+ self.baseCount += 1
1303
+ else:
1304
+ if not self.inSpike:
1305
+ #starting spike
1306
+ spikeVal = self.spikeSampler.sample()
1307
+ self.spikeLength = sampleUniform(1, self.spikeMaxDuration)
1308
+ spikeMaxPos = 0 if self.spikeLength == 1 else sampleUniform(0, self.spikeLength-1)
1309
+ self.spikeValues.clear()
1310
+ for i in range(self.spikeLength):
1311
+ if i < spikeMaxPos:
1312
+ frac = (i + 1) / (spikeMaxPos + 1)
1313
+ frac = sampleFloatFromBase(frac, 0.1 * frac)
1314
+ elif i > spikeMaxPos:
1315
+ frac = (self.spikeLength - i) / (self.spikeLength - spikeMaxPos)
1316
+ frac = sampleFloatFromBase(frac, 0.1 * frac)
1317
+ else:
1318
+ frac = 1.0
1319
+ self.spikeValues.append(frac * spikeVal)
1320
+ self.inSpike = True
1321
+ self.spikeCount = 0
1322
+
1323
+
1324
+ sampled = self.spikeValues[self.spikeCount]
1325
+ self.spikeCount += 1
1326
+
1327
+ if self.spikeCount == self.spikeLength:
1328
+ #ending spike
1329
+ self.baseCount = 0
1330
+ self.baseLength = int(self.intvSampler.sample())
1331
+ self.inSpike = False
1332
+
1333
+ return sampled
1334
+
1335
+
1336
+ class EventSampler:
1337
+ """
1338
+ sample event
1339
+ """
1340
+ def __init__(self, intvSampler, valSampler=None):
1341
+ """
1342
+ initializer
1343
+
1344
+ Parameters
1345
+ intvSampler : interval sampler
1346
+ valSampler : value sampler
1347
+ """
1348
+ self.intvSampler = intvSampler
1349
+ self.valSampler = valSampler
1350
+ self.trigger = int(self.intvSampler.sample())
1351
+ self.count = 0
1352
+
1353
+ def reset(self):
1354
+ """
1355
+ reset trigger
1356
+ """
1357
+ self.trigger = int(self.intvSampler.sample())
1358
+ self.count = 0
1359
+
1360
+ def sample(self):
1361
+ """
1362
+ sample event
1363
+ """
1364
+ if self.count == self.trigger:
1365
+ sampled = self.valSampler.sample() if self.valSampler is not None else 1.0
1366
+ self.trigger = int(self.intvSampler.sample())
1367
+ self.count = 0
1368
+ else:
1369
+ sampled = 0.0
1370
+ self.count += 1
1371
+ return sampled
1372
+
1373
+
1374
+
1375
+
1376
+ def createSampler(data):
1377
+ """
1378
+ create sampler
1379
+
1380
+ Parameters
1381
+ data : sampler description
1382
+ """
1383
+ #print(data)
1384
+ items = data.split(":")
1385
+ size = len(items)
1386
+ dtype = items[-1]
1387
+ stype = items[-2]
1388
+ #print("sampler data {}".format(data))
1389
+ #print("sampler {}".format(stype))
1390
+ sampler = None
1391
+ if stype == "uniform":
1392
+ if dtype == "int":
1393
+ min = int(items[0])
1394
+ max = int(items[1])
1395
+ sampler = UniformNumericSampler(min, max)
1396
+ elif dtype == "float":
1397
+ min = float(items[0])
1398
+ max = float(items[1])
1399
+ sampler = UniformNumericSampler(min, max)
1400
+ elif dtype == "categorical":
1401
+ values = items[:-2]
1402
+ sampler = UniformCategoricalSampler(values)
1403
+ elif stype == "normal":
1404
+ mean = float(items[0])
1405
+ sd = float(items[1])
1406
+ sampler = NormalSampler(mean, sd)
1407
+ if dtype == "int":
1408
+ sampler.sampleAsIntValue()
1409
+ elif stype == "nonparam":
1410
+ if dtype == "int" or dtype == "float":
1411
+ min = int(items[0])
1412
+ binWidth = int(items[1])
1413
+ values = items[2:-2]
1414
+ values = list(map(lambda v: int(v), values))
1415
+ sampler = NonParamRejectSampler(min, binWidth, values)
1416
+ if dtype == "float":
1417
+ sampler.sampleAsFloat()
1418
+ elif dtype == "categorical":
1419
+ values = list()
1420
+ for i in range(0, size-2, 2):
1421
+ cval = items[i]
1422
+ dist = int(items[i+1])
1423
+ pair = (cval, dist)
1424
+ values.append(pair)
1425
+ sampler = CategoricalRejectSampler(values)
1426
+ elif dtype == "scategorical":
1427
+ vfpath = items[0]
1428
+ values = getFileLines(vfpath, None)
1429
+ sampler = CategoricalSetSampler(values)
1430
+ elif stype == "discrete":
1431
+ vmin = int(items[0])
1432
+ vmax = int(items[1])
1433
+ step = int(items[2])
1434
+ values = list(map(lambda i : int(items[i]), range(3, len(items)-2)))
1435
+ sampler = DiscreteRejectSampler(vmin, vmax, step, values)
1436
+ elif stype == "bernauli":
1437
+ pr = float(items[0])
1438
+ events = None
1439
+ if len(items) == 5:
1440
+ events = list()
1441
+ if dtype == "int":
1442
+ events.append(int(items[1]))
1443
+ events.append(int(items[2]))
1444
+ elif dtype == "categorical":
1445
+ events.append(items[1])
1446
+ events.append(items[2])
1447
+ sampler = BernoulliTrialSampler(pr, events)
1448
+ else:
1449
+ raise ValueError("invalid sampler type " + stype)
1450
+ return sampler
1451
+
1452
+
1453
+
1454
+
1455
+
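Descriptor strings are ":" separated, ending with the sampler type and the data type; two examples grounded in the branches above:

    us = createSampler("1:10:uniform:int")       # uniform int in [1, 10]
    ns = createSampler("100:15:normal:float")    # gaussian, mean 100, sd 15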
matumizi/stats.py ADDED
@@ -0,0 +1,496 @@
1
+ #!/usr/local/bin/python3
2
+
3
+ # avenir-python: Machine Learning
4
+ # Author: Pranab Ghosh
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You may
8
+ # obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+
18
+ import sys
19
+ import random
20
+ import time
21
+ import math
22
+ import numpy as np
23
+ import statistics
24
+ from .util import *
25
+
26
+ """
27
+ histogram class
28
+ """
29
+ class Histogram:
30
+ def __init__(self, min, binWidth):
31
+ """
32
+ initializer
33
+
34
+ Parameters
35
+ min : min x
36
+ binWidth : bin width
37
+ """
38
+ self.xmin = min
39
+ self.binWidth = binWidth
40
+ self.normalized = False
41
+
42
+ @classmethod
43
+ def createInitialized(cls, xmin, binWidth, values):
44
+ """
45
+ create histogram instance with min domain, bin width and values
46
+
47
+ Parameters
48
+ min : min x
49
+ binWidth : bin width
50
+ values : y values
51
+ """
52
+ instance = cls(xmin, binWidth)
53
+ instance.xmax = xmin + binWidth * (len(values) - 1)
54
+ instance.ymin = 0
55
+ instance.bins = np.array(values)
56
+ instance.fmax = 0
57
+ for v in values:
58
+ if (v > instance.fmax):
59
+ instance.fmax = v
60
+ instance.ymin = 0.0
61
+ instance.ymax = instance.fmax
62
+ return instance
63
+
64
+ @classmethod
65
+ def createWithNumBins(cls, values, numBins=20):
66
+ """
67
+ create histogram instance from values and no of bins
68
+
69
+ Parameters
70
+ values : y values
71
+ numBins : no of bins
72
+ """
73
+ xmin = min(values)
74
+ xmax = max(values)
75
+ binWidth = (xmax + .01 - (xmin - .01)) / numBins
76
+ instance = cls(xmin, binWidth)
77
+ instance.xmax = xmax
78
+ instance.numBin = numBins
79
+ instance.bins = np.zeros(instance.numBin)
80
+ for v in values:
81
+ instance.add(v)
82
+ return instance
83
+
84
+ @classmethod
85
+ def createUninitialized(cls, xmin, xmax, binWidth):
86
+ """
87
+ create histogram instance with no y values using domain min , max and bin width
88
+
89
+ Parameters
90
+ min : min x
91
+ max : max x
92
+ binWidth : bin width
93
+ """
94
+ instance = cls(xmin, binWidth)
95
+ instance.xmax = xmax
96
+ instance.numBin = int((xmax - xmin) / binWidth) + 1
97
+ instance.bins = np.zeros(instance.numBin)
98
+ return instance
99
+
100
+ def initialize(self):
101
+ """
102
+ set y values to 0
103
+ """
104
+ self.bins = np.zeros(self.numBin)
105
+
106
+ def add(self, value):
107
+ """
108
+ adds a value to a bin
109
+
110
+ Parameters
111
+ value : value
112
+ """
113
+ bin = int((value - self.xmin) / self.binWidth)
114
+ if (bin < 0 or bin > self.numBin - 1):
115
+ print (bin)
116
+ raise ValueError("outside histogram range")
117
+ self.bins[bin] += 1.0
118
+
119
+ def normalize(self):
120
+ """
121
+ normalize bin counts
122
+ """
123
+ if not self.normalized:
124
+ total = self.bins.sum()
125
+ self.bins = np.divide(self.bins, total)
126
+ self.normalized = True
127
+
128
+ def cumDistr(self):
129
+ """
130
+ cumulative distribution
131
+ """
132
+ self.normalize()
133
+ self.cbins = np.cumsum(self.bins)
134
+ return self.cbins
135
+
136
+ def distr(self):
137
+ """
138
+ distr
139
+ """
140
+ self.normalize()
141
+ return self.bins
142
+
143
+
144
+ def percentile(self, percent):
145
+ """
146
+ return value corresponding to a percentile
147
+
148
+ Parameters
149
+ percent : percentile value
150
+ """
151
+ if self.cbins is None:
152
+ raise ValueError("cumulative distribution is not available")
153
+
154
+ for i,cuml in enumerate(self.cbins):
155
+ if percent > cuml:
156
+ value = (i * self.binWidth) - (self.binWidth / 2) + \
157
+ (percent - self.cbins[i-1]) * self.binWidth / (self.cbins[i] - self.cbins[i-1])
158
+ break
159
+ return value
160
+
161
+ def max(self):
162
+ """
163
+ return max bin value
164
+ """
165
+ return self.bins.max()
166
+
167
+ def value(self, x):
168
+ """
169
+ return a bin value
170
+
171
+ Parameters
172
+ x : x value
173
+ """
174
+ bin = int((x - self.xmin) / self.binWidth)
175
+ f = self.bins[bin]
176
+ return f
177
+
178
+ def bin(self, x):
179
+ """
180
+ return a bin index
181
+
182
+ Parameters
183
+ x : x value
184
+ """
185
+ return int((x - self.xmin) / self.binWidth)
186
+
187
+ def cumValue(self, x):
188
+ """
189
+ return a cumulative bin value
190
+
191
+ Parameters
192
+ x : x value
193
+ """
194
+ bin = int((x - self.xmin) / self.binWidth)
195
+ c = self.cbins[bin]
196
+ return c
197
+
198
+
199
+ def getMinMax(self):
200
+ """
201
+ returns x min and x max
202
+ """
203
+ return (self.xmin, self.xmax)
204
+
205
+ def boundedValue(self, x):
206
+ """
207
+ return x bounded by min and max
208
+
209
+ Parameters
210
+ x : x value
211
+ """
212
+ if x < self.xmin:
213
+ x = self.xmin
214
+ elif x > self.xmax:
215
+ x = self.xmax
216
+ return x
217
+
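A usage sketch (`values` is any numeric list):

    h = Histogram.createWithNumBins(values, 10)
    d = h.distr()        # normalized bin counts
    c = h.cumDistr()     # cumulative distribution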
218
+ """
219
+ categorical histogram class
220
+ """
221
+ class CatHistogram:
222
+ def __init__(self):
223
+ """
224
+ initializer
225
+ """
226
+ self.binCounts = dict()
227
+ self.counts = 0
228
+ self.normalized = False
229
+
230
+ def add(self, value):
231
+ """
232
+ adds a value to a bin
233
+
234
+ Parameters
235
+ x : x value
236
+ """
237
+ addToKeyedCounter(self.binCounts, value)
238
+ self.counts += 1
239
+
240
+ def normalize(self):
241
+ """
242
+ normalize
243
+ """
244
+ if not self.normalized:
245
+ self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))
246
+ self.normalized = True
247
+
248
+ def getMode(self):
249
+ """
250
+ get mode
251
+ """
252
+ maxk = None
253
+ maxv = 0
254
+ #print(self.binCounts)
255
+ for k,v in self.binCounts.items():
256
+ if v > maxv:
257
+ maxk = k
258
+ maxv = v
259
+ return (maxk, maxv)
260
+
261
+ def getEntropy(self):
262
+ """
263
+ get entropy
264
+ """
265
+ self.normalize()
266
+ entr = 0
267
+ #print(self.binCounts)
268
+ for k,v in self.binCounts.items():
269
+ entr -= v * math.log(v)
270
+ return entr
271
+
272
+ def getUniqueValues(self):
273
+ """
274
+ get unique values
275
+ """
276
+ return list(self.binCounts.keys())
277
+
278
+ def getDistr(self):
279
+ """
280
+ get distribution
281
+ """
282
+ self.normalize()
283
+ return self.binCounts.copy()
284
+
285
+ class RunningStat:
286
+ """
287
+ running stat class
288
+ """
289
+ def __init__(self):
290
+ """
291
+ initializer
292
+ """
293
+ self.sum = 0.0
294
+ self.sumSq = 0.0
295
+ self.count = 0
296
+
297
+ @staticmethod
298
+ def create(count, sum, sumSq):
299
+ """
300
+ creates instance
301
+
302
+ Parameters
303
+ count : count of values
+ sum : sum of values
304
+ sumSq : sum of values squared
305
+ """
306
+ rs = RunningStat()
307
+ rs.sum = sum
308
+ rs.sumSq = sumSq
309
+ rs.count = count
310
+ return rs
311
+
312
+ def add(self, value):
313
+ """
314
+ adds new value
315
+
316
+ Parameters
317
+ value : value to add
318
+ """
319
+ self.sum += value
320
+ self.sumSq += (value * value)
321
+ self.count += 1
322
+
323
+ def getStat(self):
324
+ """
325
+ return mean and std deviation
326
+ """
327
+ mean = self.sum / self.count
328
+ t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)
329
+ sd = math.sqrt(t)
330
+ re = (mean, sd)
331
+ return re
332
+
333
+ def addGetStat(self,value):
334
+ """
335
+ calculate mean and std deviation with new value added
336
+
337
+ Parameters
338
+ value : value to add
339
+ """
340
+ self.add(value)
341
+ re = self.getStat()
342
+ return re
343
+
344
+ def getCount(self):
345
+ """
346
+ return count
347
+ """
348
+ return self.count
349
+
350
+ def getState(self):
351
+ """
352
+ return state
353
+ """
354
+ s = (self.count, self.sum, self.sumSq)
355
+ return s
356
+
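A usage sketch: constant memory mean and std deviation; getState and create let the accumulator be persisted and restored:

    rs = RunningStat()
    for v in [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]:
        rs.add(v)
    (mean, sd) = rs.getStat()
    rs2 = RunningStat.create(*rs.getState())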
357
+ class SlidingWindowStat:
358
+ """
359
+ sliding window stats
360
+ """
361
+ def __init__(self):
362
+ """
363
+ initializer
364
+ """
365
+ self.sum = 0.0
366
+ self.sumSq = 0.0
367
+ self.count = 0
368
+ self.values = None
369
+
370
+ @staticmethod
371
+ def create(values, sum, sumSq):
372
+ """
373
+ creates instance
374
+
375
+ Parameters
376
+ values : list of values
+ sum : sum of values
377
+ sumSq : sum of values squared
378
+ """
379
+ sws = SlidingWindowStat()
380
+ sws.sum = sum
381
+ sws.sumSq = sumSq
382
+ sws.values = values.copy()
383
+ sws.count = len(sws.values)
384
+ return sws
385
+
386
+ @staticmethod
387
+ def initialize(values):
388
+ """
389
+ creates instance
390
+
391
+ Parameters
392
+ values : list of values
393
+ """
394
+ sws = SlidingWindowStat()
395
+ sws.values = values.copy()
396
+ for v in sws.values:
397
+ sws.sum += v
398
+ sws.sumSq += v * v
399
+ sws.count = len(sws.values)
400
+ return sws
401
+
402
+ @staticmethod
403
+ def createEmpty(count):
404
+ """
405
+ creates instance
406
+
407
+ Parameters
408
+ count : count of values
409
+ """
410
+ sws = SlidingWindowStat()
411
+ sws.count = count
412
+ sws.values = list()
413
+ return sws
414
+
415
+ def add(self, value):
416
+ """
417
+ adds new value
418
+
419
+ Parameters
420
+ value : value to add
421
+ """
422
+ self.values.append(value)
423
+ if len(self.values) > self.count:
424
+ self.sum += value - self.values[0]
425
+ self.sumSq += (value * value) - (self.values[0] * self.values[0])
426
+ self.values.pop(0)
427
+ else:
428
+ self.sum += value
429
+ self.sumSq += (value * value)
430
+
431
+
432
+ def getStat(self):
433
+ """
434
+ calculate mean and std deviation
435
+ """
436
+ mean = self.sum / self.count
437
+ t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)
438
+ sd = math.sqrt(t)
439
+ re = (mean, sd)
440
+ return re
441
+
442
+ def addGetStat(self,value):
443
+ """
444
+ calculate mean and std deviation with new value added
445
+ """
446
+ self.add(value)
447
+ re = self.getStat()
448
+ return re
449
+
450
+ def getCount(self):
451
+ """
452
+ return count
453
+ """
454
+ return self.count
455
+
456
+ def getCurSize(self):
457
+ """
458
+ return current number of values in window
459
+ """
460
+ return len(self.values)
461
+
462
+ def getState(self):
463
+ """
464
+ return state
465
+ """
466
+ s = (self.count, self.sum, self.sumSq)
467
+ return s
468
+
469
+
470
+ def basicStat(ldata):
471
+ """
472
+ mean and std dev
473
+
474
+ Parameters
475
+ ldata : list of values
476
+ """
477
+ m = statistics.mean(ldata)
478
+ s = statistics.stdev(ldata, xbar=m)
479
+ r = (m, s)
480
+ return r
481
+
482
+ def getFileColumnStat(filePath, col, delem=","):
483
+ """
484
+ gets stats for a file column
485
+
486
+ Parameters
487
+ filePath : file path
488
+ col : col index
489
+ delem : field delimiter
490
+ """
491
+ rs = RunningStat()
492
+ for rec in fileRecGen(filePath, delem):
493
+ va = float(rec[col])
494
+ rs.add(va)
495
+
496
+ return rs.getStat()
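
A quick usage sketch for the streaming-stat classes above (illustrative only; assumes the package is importable as matumizi.daexp; the data values are made up):

	from matumizi.daexp import RunningStat, SlidingWindowStat

	# running stats over all values seen so far
	rs = RunningStat()
	for v in [2.0, 4.0, 4.0, 5.0]:
		rs.add(v)
	print(rs.getStat())		# (3.75, 1.258...) i.e. (mean, sample std dev)

	# stats over a sliding window holding the last 3 values
	sws = SlidingWindowStat.initialize([2.0, 4.0, 4.0])
	print(sws.addGetStat(5.0))	# computed over the window [4.0, 4.0, 5.0]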
matumizi/util.py ADDED
@@ -0,0 +1,2345 @@
1
+ #!/usr/local/bin/python3
2
+
3
+ # Author: Pranab Ghosh
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
6
+ # may not use this file except in compliance with the License. You may
7
+ # obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import os
18
+ import sys
19
+ from random import randint
20
+ import random
21
+ import time
22
+ import uuid
23
+ from datetime import datetime
24
+ import math
25
+ import numpy as np
26
+ import pandas as pd
27
+ import matplotlib.pyplot as plt
29
+ import logging
30
+ import logging.handlers
31
+ import pickle
32
+ from contextlib import contextmanager
33
+
34
+ tokens = ["0","1","2","3","4","5","6","7","8","9","A","B","C","D","E","F","G","H","I","J","K","L","M",
35
+ "N","O","P","Q","R","S","T","U","V","W","X","Y","Z","0","1","2","3","4","5","6","7","8","9"]
36
+ numTokens = tokens[:10]
37
+ alphaTokens = tokens[10:36]
38
+ loCaseChars = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k","l","m","n","o",
39
+ "p","q","r","s","t","u","v","w","x","y","z"]
40
+
41
+ typeInt = "int"
42
+ typeFloat = "float"
43
+ typeString = "string"
44
+
45
+ secInMinute = 60
46
+ secInHour = 60 * 60
47
+ secInDay = 24 * secInHour
48
+ secInWeek = 7 * secInDay
49
+ secInYear = 365 * secInDay
50
+ secInMonth = secInYear / 12
51
+
52
+ minInHour = 60
53
+ minInDay = 24 * minInHour
54
+
55
+ ftPerYard = 3
56
+ ftPerMile = ftPerYard * 1760
57
+
58
+
59
+ def genID(size):
60
+ """
61
+ generates ID
62
+
63
+ Parameters
64
+ size : size of ID
65
+ """
66
+ id = ""
67
+ for i in range(size):
68
+ id = id + selectRandomFromList(tokens)
69
+ return id
70
+
71
+ def genIdList(numId, idSize):
72
+ """
73
+ generate list of IDs
74
+
75
+ Parameters:
76
+ numId: number of Ids
77
+ idSize: ID size
78
+ """
79
+ iDs = []
80
+ for i in range(numId):
81
+ iDs.append(genID(idSize))
82
+ return iDs
83
+
84
+ def genNumID(size):
85
+ """
86
+ generates ID consisting of digits only
87
+
88
+ Parameters
89
+ size : size of ID
90
+ """
91
+ id = ""
92
+ for i in range(size):
93
+ id = id + selectRandomFromList(numTokens)
94
+ return id
95
+
96
+ def genLowCaseID(size):
97
+ """
98
+ generates ID consisting of lower case chars
99
+
100
+ Parameters
101
+ size : size of ID
102
+ """
103
+ id = ""
104
+ for i in range(size):
105
+ id = id + selectRandomFromList(loCaseChars)
106
+ return id
107
+
108
+ def genNumIdList(numId, idSize):
109
+ """
110
+ generate list of numeric IDs
111
+
112
+ Parameters:
113
+ numId: number of Ids
114
+ idSize: ID size
115
+ """
116
+ iDs = []
117
+ for i in range(numId):
118
+ iDs.append(genNumID(idSize))
119
+ return iDs
120
+
121
+ def genNameInitial():
122
+ """
123
+ generate name initial
124
+ """
125
+ return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)
126
+
127
+ def genPhoneNum(arCode):
128
+ """
129
+ generates phone number
130
+
131
+ Parameters
132
+ arCode: area code
133
+ """
134
+ phNum = genNumID(7)
135
+ return arCode + str(phNum)
136
+
137
+ def selectRandomFromList(ldata):
138
+ """
139
+ select an element randomly from a list
140
+
141
+ Parameters
142
+ ldata : list data
143
+ """
144
+ return ldata[randint(0, len(ldata)-1)]
145
+
146
+ def selectOtherRandomFromList(ldata, cval):
147
+ """
148
+ select an element randomly from a list excluding the given one
149
+
150
+ Parameters
151
+ ldata : list data
152
+ cval : value to be excluded
153
+ """
154
+ nval = selectRandomFromList(ldata)
155
+ while nval == cval:
156
+ nval = selectRandomFromList(ldata)
157
+ return nval
158
+
159
+ def selectRandomSubListFromList(ldata, num):
160
+ """
161
+ generates random sublist from a list without replacement
162
+
163
+ Parameters
164
+ ldata : list data
165
+ num : output list size
166
+ """
167
+ assertLesser(num, len(ldata), "size of sublist to be sampled greater than or equal to main list")
168
+ i = randint(0, len(ldata)-1)
169
+ sel = ldata[i]
170
+ selSet = {i}
171
+ selList = [sel]
172
+ while (len(selSet) < num):
173
+ i = randint(0, len(ldata)-1)
174
+ if (i not in selSet):
175
+ sel = ldata[i]
176
+ selSet.add(i)
177
+ selList.append(sel)
178
+ return selList
179
+
180
+ def selectRandomSubListFromListWithRepl(ldata, num):
181
+ """
182
+ generates random sublist from a list with replacement
183
+
184
+ Parameters
185
+ ldata : list data
186
+ num : output list size
187
+
188
+ """
189
+ return list(map(lambda i : selectRandomFromList(ldata), range(num)))
190
+
191
+ def selectRandomFromDict(ddata):
192
+ """
193
+ select an element randomly from a dictionary
194
+
195
+ Parameters
196
+ ddata : dictionary data
197
+ """
198
+ dkeys = list(ddata.keys())
199
+ dk = selectRandomFromList(dkeys)
200
+ el = (dk, ddata[dk])
201
+ return el
202
+
203
+ def setListRandomFromList(ldata, ldataRepl):
204
+ """
205
+ sets some elements in the first list randomly with elements from the second list
206
+
207
+ Parameters
208
+ ldata : list data
209
+ ldataRepl : list with replacement data
210
+ """
211
+ l = len(ldata)
212
+ selSet = set()
213
+ for d in ldataRepl:
214
+ i = randint(0, l-1)
215
+ while i in selSet:
216
+ i = randint(0, l-1)
217
+ ldata[i] = d
218
+ selSet.add(i)
219
+
220
+ def genIpAddress():
221
+ """
222
+ generates IP address
223
+ """
224
+ # randint is inclusive at both ends, so octets must be sampled from [0, 255]
+ i1 = randint(0,255)
225
+ i2 = randint(0,255)
226
+ i3 = randint(0,255)
227
+ i4 = randint(0,255)
228
+ ip = "%d.%d.%d.%d" %(i1,i2,i3,i4)
229
+ return ip
230
+
231
+ def curTimeMs():
232
+ """
233
+ current time in ms
234
+ """
235
+ return int((datetime.utcnow() - datetime(1970,1,1)).total_seconds() * 1000)
236
+
237
+ def secDegPolyFit(x1, y1, x2, y2, x3, y3):
238
+ """
239
+ second deg polynomial
240
+
241
+ Parameters
242
+ x1 : 1st point x
243
+ y1 : 1st point y
244
+ x2 : 2nd point x
245
+ y2 : 2nd point y
246
+ x3 : 3rd point x
247
+ y3 : 3rd point y
248
+ """
249
+ t = (y1 - y2) / (x1 - x2)
250
+ a = t - (y2 - y3) / (x2 - x3)
251
+ a = a / (x1 - x3)
252
+ b = t - a * (x1 + x2)
253
+ c = y1 - a * x1 * x1 - b * x1
254
+ return (a, b, c)
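+ # illustrative check (comment only): the fit passes exactly through the three
+ # points, e.g. for the parabola y = x^2,
+ # secDegPolyFit(0, 0, 1, 1, 2, 4) -> (1.0, 0.0, 0.0)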
255
+
256
+ def range_limit(val, minv, maxv):
257
+ """
258
+ range limit a value
259
+
260
+ Parameters
261
+ val : data value
262
+ minv : minimum
263
+ maxv : maximum
264
+ """
265
+ if (val < minv):
266
+ val = minv
267
+ elif (val > maxv):
268
+ val = maxv
269
+ return val
270
+
271
+ def rangeLimit(val, minv, maxv):
272
+ """
273
+ range limit a value
274
+
275
+ Parameters
276
+ val : data value
277
+ minv : minimum
278
+ maxv : maximum
279
+ """
280
+ return range_limit(val, minv, maxv)
281
+
282
+ def isInRange(val, minv, maxv):
283
+ """
284
+ checks if within range
285
+
286
+ Parameters
287
+ val : data value
288
+ minv : minimum
289
+ maxv : maximum
290
+ """
291
+ return val >= minv and val <= maxv
292
+
293
+ def stripFileLines(filePath, offset):
294
+ """
295
+ strips a number of chars from both ends of each line and prints the result
296
+
297
+ Parameters
298
+ filePath : file path
299
+ offset : offset from both ends of line
300
+ """
301
+ fp = open(filePath, "r")
302
+ for line in fp:
303
+ stripped = line[offset:len(line) - 1 - offset]
304
+ print (stripped)
305
+ fp.close()
306
+
307
+ def genLatLong(lat1, long1, lat2, long2):
308
+ """
309
+ generate lat long within limits
310
+
311
+ Parameters
312
+ lat1 : lat of 1st point
313
+ long1 : long of 1st point
314
+ lat2 : lat of 2nd point
315
+ long2 : long of 2nd point
316
+ """
317
+ lat = lat1 + (lat2 - lat1) * random.random()
318
+ longg = long1 + (long2 - long1) * random.random()
319
+ return (lat, longg)
320
+
321
+ def geoDistance(lat1, long1, lat2, long2):
322
+ """
323
+ find geo distance in ft
324
+
325
+ Parameters
326
+ lat1 : lat of 1st point
327
+ long1 : long of 1st point
328
+ lat2 : lat of 2nd point
329
+ long2 : long of 2nd point
330
+ """
331
+ latDiff = math.radians(lat1 - lat2)
332
+ longDiff = math.radians(long1 - long2)
333
+ l1 = math.sin(latDiff/2.0)
334
+ l2 = math.sin(longDiff/2.0)
335
+ l3 = math.cos(math.radians(lat1))
336
+ l4 = math.cos(math.radians(lat2))
337
+ a = l1 * l1 + l3 * l4 * l2 * l2
338
+ l5 = math.sqrt(a)
339
+ l6 = math.sqrt(1.0 - a)
340
+ c = 2.0 * math.atan2(l5, l6)
341
+ r = 6371008.8 * 3.280840
342
+ return c * r
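+ # note (comment only): this is the haversine formula with mean earth radius
+ # 6371008.8 m converted to ft, so the result is in feet; e.g. one degree of
+ # latitude: geoDistance(0.0, 0.0, 1.0, 0.0) -> roughly 3.65e5 ft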
343
+
344
+ def minLimit(val, limit):
345
+ """
346
+ min limit
347
+ Parameters
348
+ val : value
+ limit : minimum limit
349
+ """
350
+ if (val < limit):
351
+ val = limit
352
+ return val
353
+
354
+ def maxLimit(val, limit):
355
+ """
356
+ max limit
357
+ Parameters
358
+ val : value
+ limit : maximum limit
359
+ """
360
+ if (val > limit):
361
+ val = limit
362
+ return val
363
+
364
+ def rangeSample(val, minLim, maxLim):
365
+ """
366
+ if out side range sample within range
367
+
368
+ Parameters
369
+ val : value
370
+ minLim : minimum
371
+ maxLim : maximum
372
+ """
373
+ if val < minLim or val > maxLim:
374
+ val = randint(minLim, maxLim)
375
+ return val
376
+
377
+ def genRandomIntListWithinRange(size, minLim, maxLim):
378
+ """
379
+ random unique list of integers within range
380
+
381
+ Parameters
382
+ size : size of returned list
383
+ minLim : minimum
384
+ maxLim : maximum
385
+ """
386
+ values = set()
387
+ # keep sampling until the requested number of unique values is collected
+ # (assumes maxLim - minLim + 1 >= size)
+ while len(values) < size:
388
+ val = randint(minLim, maxLim)
389
+ values.add(val)
390
+ return list(values)
392
+
393
+ def preturbScalar(value, vrange, distr="uniform"):
394
+ """
395
+ perturbs a value with multiplicative noise within a range
396
+
397
+ Parameters
398
+ value : data value
399
+ vrange : value delta fraction
400
+ distr : noise distribution type
401
+ """
402
+ if distr == "uniform":
403
+ scale = 1.0 - vrange + 2 * vrange * random.random()
404
+ elif distr == "normal":
405
+ scale = 1.0 + np.random.normal(0, vrange)
406
+ else:
407
+ exitWithMsg("unknown noise distr " + distr)
408
+ return value * scale
409
+
410
+ def preturbScalarAbs(value, vrange):
411
+ """
412
+ perturbs a value with additive noise within a range
413
+
414
+ Parameters
415
+ value : data value
416
+ vrange : value delta absolute
417
+
418
+ """
419
+ delta = - vrange + 2.0 * vrange * random.random()
420
+ return value + delta
421
+
422
+ def preturbVector(values, vrange):
423
+ """
424
+ perturbs a list within a range
425
+
426
+ Parameters
427
+ values : list data
428
+ vrange : value delta fraction
429
+ """
430
+ nValues = list(map(lambda va: preturbScalar(va, vrange), values))
431
+ return nValues
432
+
433
+ def randomShiftVector(values, smin, smax):
434
+ """
435
+ shifts a list by a random quantity within a range
436
+
437
+ Parameters
438
+ values : list data
439
+ smin : sampling minimum
440
+ smax : sampling maximum
441
+ """
442
+ shift = np.random.uniform(smin, smax)
443
+ return list(map(lambda va: va + shift, values))
444
+
445
+ def floatRange(beg, end, incr):
446
+ """
447
+ generates float range
448
+
449
+ Parameters
450
+ beg : range begin
451
+ end: range end
452
+ incr : range increment
453
+ """
454
+ return list(np.arange(beg, end, incr))
455
+
456
+ def shuffle(values, *numShuffles):
457
+ """
458
+ in place shuffling with swap of pairs
459
+
460
+ Parameters
461
+ values : list data
462
+ numShuffles : parameter list for number of shuffles
463
+ """
464
+ size = len(values)
465
+ if len(numShuffles) == 0:
466
+ numShuffle = int(size / 2)
467
+ elif len(numShuffles) == 1:
468
+ numShuffle = numShuffles[0]
469
+ else:
470
+ numShuffle = randint(numShuffles[0], numShuffles[1])
471
+ print("numShuffle {}".format(numShuffle))
472
+ for i in range(numShuffle):
473
+ first = random.randint(0, size - 1)
474
+ second = random.randint(0, size - 1)
475
+ while first == second:
476
+ second = random.randint(0, size - 1)
477
+ tmp = values[first]
478
+ values[first] = values[second]
479
+ values[second] = tmp
480
+
481
+
482
+ def splitList(itms, numGr):
483
+ """
484
+ splits a list into sub lists of approximately equal size, with items in sublists randomly chosen
485
+
486
+ Parameters
487
+ itms : list of values
488
+ numGr : no of groups
489
+ """
490
+ tcount = len(itms)
491
+ cItems = list(itms)
492
+ sz = int(len(cItems) / numGr)
493
+ groups = list()
494
+ count = 0
495
+ for i in range(numGr):
496
+ if (i == numGr - 1):
497
+ csz = tcount - count
498
+ else:
499
+ csz = sz + randint(-2, 2)
500
+ count += csz
501
+ gr = list()
502
+ for j in range(csz):
503
+ it = selectRandomFromList(cItems)
504
+ gr.append(it)
505
+ cItems.remove(it)
506
+ groups.append(gr)
507
+ return groups
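+ # usage sketch (comment only): splitList(list(range(10)), 3) returns 3 randomly
+ # composed groups; group sizes vary by a few elements but always sum to the
+ # original list length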
508
+
509
+ def multVector(values, vrange):
510
+ """
511
+ multiplies a list within value range
512
+
513
+ Parameters
514
+ values : list of values
515
+ vrange : fraction of value to be used to update
516
+ """
517
+ scale = 1.0 - vrange + 2 * vrange * random.random()
518
+ nValues = list(map(lambda va: va * scale, values))
519
+ return nValues
520
+
521
+ def weightedAverage(values, weights):
522
+ """
523
+ calculates weighted average
524
+
525
+ Parameters
526
+ values : list of values
527
+ weights : list of weights
528
+ """
529
+ assert len(values) == len(weights), "values and weights should be same size"
530
+ vw = zip(values, weights)
531
+ wva = list(map(lambda e : e[0] * e[1], vw))
532
+ #wa = sum(x * y for x, y in vw) / sum(weights)
533
+ wav = sum(wva) / sum(weights)
534
+ return wav
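+ # worked example (comment only):
+ # weightedAverage([1, 2], [3, 1]) -> (1*3 + 2*1) / (3 + 1) = 1.25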
535
+
536
+ def extractFields(line, delim, keepIndices):
537
+ """
538
+ breaks a line into fields and keeps only specified fields and returns new line
539
+
540
+ Parameters
541
+ line : delim separated string
542
+ delim : delimiter
543
+ keepIndices : list of indexes to fields to be retained
544
+ """
545
+ items = line.split(delim)
546
+ newLine = []
547
+ for i in keepIndices:
548
+ newLine.append(items[i])
549
+ return delim.join(newLine)
550
+
551
+ def remFields(line, delim, remIndices):
552
+ """
553
+ removes fields from delim separated string
554
+
555
+ Parameters
556
+ line : delimiter separated string
557
+ delim : delimiter
558
+ remIndices : list of indexes to fields to be removed
559
+ """
560
+ items = line.split(delim)
561
+ newLine = []
562
+ for i in range(len(items)):
563
+ if not arrayContains(remIndices, i):
564
+ newLine.append(items[i])
565
+ return delim.join(newLine)
566
+
567
+ def extractList(data, indices):
568
+ """
569
+ extracts list from another list, given indices
570
+
571
+ Parameters
572
+ data : list data
573
+ indices : list of indexes to fields to be retained
574
+ """
575
+ if areAllFieldsIncluded(data, indices):
576
+ exList = data.copy()
577
+ #print("all indices")
578
+ else:
579
+ exList = list()
580
+ le = len(data)
581
+ for i in indices:
582
+ assert i < le , "index {} out of bound {}".format(i, le)
583
+ exList.append(data[i])
584
+
585
+ return exList
586
+
587
+ def arrayContains(arr, item):
588
+ """
589
+ checks if array contains an item
590
+
591
+ Parameters
592
+ arr : list data
593
+ item : item to search
594
+ """
595
+ contains = True
596
+ try:
597
+ arr.index(item)
598
+ except ValueError:
599
+ contains = False
600
+ return contains
601
+
602
+ def strToIntArray(line, delim=","):
603
+ """
604
+ int array from delim separated string
605
+
606
+ Parameters
607
+ line ; delemeter separated string
608
+ """
609
+ arr = line.split(delim)
610
+ return [int(a) for a in arr]
611
+
612
+ def strToFloatArray(line, delim=","):
613
+ """
614
+ float array from delim separated string
615
+
616
+ Parameters
617
+ line : delimiter separated string
618
+ """
619
+ arr = line.split(delim)
620
+ return [float(a) for a in arr]
621
+
622
+ def strListOrRangeToIntArray(line):
623
+ """
624
+ int array from delim separated string or range
625
+
626
+ Parameters
627
+ line : delimiter separated string
628
+ """
629
+ varr = line.split(",")
630
+ if (len(varr) > 1):
631
+ iarr = list(map(lambda v: int(v), varr))
632
+ else:
633
+ vrange = line.split(":")
634
+ if (len(vrange) == 2):
635
+ lo = int(vrange[0])
636
+ hi = int(vrange[1])
637
+ iarr = list(range(lo, hi+1))
638
+ else:
639
+ iarr = [int(line)]
640
+ return iarr
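+ # examples (comment only): "1,3,5" -> [1, 3, 5]; "2:5" -> [2, 3, 4, 5]
+ # (the lo:hi range is inclusive at both ends); "7" -> [7]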
641
+
642
+ def toStr(val, precision):
643
+ """
644
+ converts any type to string
645
+
646
+ Parameters
647
+ val : value
648
+ precision : precision for float value
649
+ """
650
+ if type(val) == float or type(val) == np.float64 or type(val) == np.float32:
651
+ format = "%" + ".%df" %(precision)
652
+ sVal = format %(val)
653
+ else:
654
+ sVal = str(val)
655
+ return sVal
656
+
657
+ def toStrFromList(values, precision, delim=","):
658
+ """
659
+ converts list of any type to delim separated string
660
+
661
+ Parameters
662
+ values : list data
663
+ precision : precision for float value
664
+ delim : delimiter
665
+ """
666
+ sValues = list(map(lambda v: toStr(v, precision), values))
667
+ return delim.join(sValues)
668
+
669
+ def toIntList(values):
670
+ """
671
+ convert to int list
672
+
673
+ Parameters
674
+ values : list data
675
+ """
676
+ return list(map(lambda va: int(va), values))
677
+
678
+ def toFloatList(values):
679
+ """
680
+ convert to float list
681
+
682
+ Parameters
683
+ values : list data
684
+
685
+ """
686
+ return list(map(lambda va: float(va), values))
687
+
688
+ def toStrList(values, precision=None):
689
+ """
690
+ convert to string list
691
+
692
+ Parameters
693
+ values : list data
694
+ precision : precision for float value
695
+ """
696
+ return list(map(lambda va: toStr(va, precision), values))
697
+
698
+ def toIntFromBoolean(value):
699
+ """
700
+ convert to int
701
+
702
+ Parameters
703
+ value : boolean value
704
+ """
705
+ ival = 1 if value else 0
706
+ return ival
707
+
708
+ def scaleBySum(ldata):
709
+ """
710
+ scales so that sum is 1
711
+
712
+ Parameters
713
+ ldata : list data
714
+ """
715
+ s = sum(ldata)
716
+ return list(map(lambda e : e/s, ldata))
717
+
718
+ def scaleByMax(ldata):
719
+ """
720
+ scales so that max value is 1
721
+
722
+ Parameters
723
+ ldata : list data
724
+ """
725
+ m = max(ldata)
726
+ return list(map(lambda e : e/m, ldata))
727
+
728
+ def typedValue(val, dtype=None):
729
+ """
730
+ return typed value given string, discovers data type if not specified
731
+
732
+ Parameters
733
+ val : value
734
+ dtype : data type
735
+ """
736
+ tVal = None
737
+
738
+ if dtype is not None:
739
+ if dtype == "num":
740
+ dtype = "int" if dtype.find(".") == -1 else "float"
741
+
742
+ if dtype == "int":
743
+ tVal = int(val)
744
+ elif dtype == "float":
745
+ tVal = float(val)
746
+ elif dtype == "bool":
747
+ tVal = bool(val)
748
+ else:
749
+ tVal = val
750
+ else:
751
+ if type(val) == str:
752
+ lVal = val.lower()
753
+
754
+ #int
755
+ done = True
756
+ try:
757
+ tVal = int(val)
758
+ except ValueError:
759
+ done = False
760
+
761
+ #float
762
+ if not done:
763
+ done = True
764
+ try:
765
+ tVal = float(val)
766
+ except ValueError:
767
+ done = False
768
+
769
+ #boolean
770
+ if not done:
771
+ done = True
772
+ if lVal == "true":
773
+ tVal = True
774
+ elif lVal == "false":
775
+ tVal = False
776
+ else:
777
+ done = False
778
+ #None
779
+ if not done:
780
+ if lVal == "none":
781
+ tVal = None
782
+ else:
783
+ tVal = val
784
+ else:
785
+ tVal = val
786
+
787
+ return tVal
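+ # examples (comment only): typedValue("42") -> 42, typedValue("4.2") -> 4.2,
+ # typedValue("true") -> True, typedValue("none") -> None, typedValue("abc") -> "abc"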
788
+
789
+ def isInt(val):
790
+ """
791
+ return true if string is int and the typed value
792
+
793
+ Parameters
794
+ val : value
795
+ """
796
+ valInt = True
797
+ try:
798
+ tVal = int(val)
799
+ except ValueError:
800
+ valInt = False
801
+ tVal = None
802
+ r = (valInt, tVal)
803
+ return r
804
+
805
+ def isFloat(val):
806
+ """
807
+ return true if string is float
808
+
809
+ Parameters
810
+ val : value
811
+ """
812
+ valFloat = True
813
+ try:
814
+ tVal = float(val)
815
+ except ValueError:
816
+ valFloat = False
817
+ tVal = None
818
+ r = (valFloat, tVal)
819
+ return r
820
+
821
+ def getAllFiles(dirPath):
822
+ """
823
+ get all files recursively
824
+
825
+ Parameters
826
+ dirPath : directory path
827
+ """
828
+ filePaths = []
829
+ for (thisDir, subDirs, fileNames) in os.walk(dirPath):
830
+ for fileName in fileNames:
831
+ filePaths.append(os.path.join(thisDir, fileName))
832
+ filePaths.sort()
833
+ return filePaths
834
+
835
+ def getFileContent(fpath, verbose=False):
836
+ """
837
+ get file contents in directory
838
+
839
+ Parameters
840
+ fpath : directory path
841
+ verbose : verbosity flag
842
+ """
843
+ # document list
844
+ docComplete = []
845
+ filePaths = getAllFiles(fpath)
846
+
847
+ # read files
848
+ for filePath in filePaths:
849
+ if verbose:
850
+ print("next file " + filePath)
851
+ with open(filePath, 'r') as contentFile:
852
+ content = contentFile.read()
853
+ docComplete.append(content)
854
+ return (docComplete, filePaths)
855
+
856
+ def getOneFileContent(fpath):
857
+ """
858
+ get one file contents
859
+
860
+ Parameters
861
+ fpath : file path
862
+ """
863
+ with open(fpath, 'r') as contentFile:
864
+ docStr = contentFile.read()
865
+ return docStr
866
+
867
+ def getFileLines(dirPath, delim=","):
868
+ """
869
+ get lines from a file
870
+
871
+ Parameters
872
+ dirPath : file path
873
+ delim : delimiter
874
+ """
875
+ lines = list()
876
+ for li in fileRecGen(dirPath, delim):
877
+ lines.append(li)
878
+ return lines
879
+
880
+ def getFileSampleLines(dirPath, percen, delim=","):
881
+ """
882
+ get sampled lines from a file
883
+
884
+ Parameters
885
+ dirPath : file path
886
+ percen : sampling percentage
887
+ delim : delimiter
888
+ """
889
+ lines = list()
890
+ for li in fileRecGen(dirPath, delim):
891
+ if randint(0, 100) < percen:
892
+ lines.append(li)
893
+ return lines
894
+
895
+ def getFileColumnAsString(dirPath, index, delim=","):
896
+ """
897
+ get string column from a file
898
+
899
+ Parameters
900
+ dirPath : file path
901
+ index : index
902
+ delim : delimiter
903
+ """
904
+ fields = list()
905
+ for rec in fileRecGen(dirPath, delim):
906
+ fields.append(rec[index])
907
+ #print(fields)
908
+ return fields
909
+
910
+ def getFileColumnsAsString(dirPath, indexes, delim=","):
911
+ """
912
+ get multiple string columns from a file
913
+
914
+ Parameters
915
+ dirPath : file path
916
+ indexes : indexes of columns
917
+ delim : delimiter
918
+
919
+ """
920
+ nindex = len(indexes)
921
+ columns = list(map(lambda i : list(), range(nindex)))
922
+ for rec in fileRecGen(dirPath, delim):
923
+ for i in range(nindex):
924
+ columns[i].append(rec[indexes[i]])
925
+ return columns
926
+
927
+ def getFileColumnAsFloat(dirPath, index, delim=","):
928
+ """
929
+ get float fields from a file
930
+
931
+ Parameters
932
+ dirPath : file path
933
+ index : index
934
+ delim : delimiter
935
+
936
+ """
937
+ #print("{} {}".format(dirPath, index))
938
+ fields = getFileColumnAsString(dirPath, index, delim)
939
+ return list(map(lambda v:float(v), fields))
940
+
941
+ def getFileColumnAsInt(dirPath, index, delim=","):
942
+ """
943
+ get int fields from a file
944
+
945
+ Parameters
946
+ dirPath : file path
947
+ index : index
948
+ delim : delimiter
949
+ """
950
+ fields = getFileColumnAsString(dirPath, index, delim)
951
+ return list(map(lambda v:int(v), fields))
952
+
953
+ def getFileAsIntMatrix(dirPath, columns, delim=","):
954
+ """
955
+ extracts int matrix from csv file given column indices with each row being concatenation of
956
+ extracted column values row size = num of columns
957
+
958
+ Parameters
959
+ dirPath : file path
960
+ columns : indexes of columns
961
+ delim : delimiter
962
+ """
963
+ mat = list()
964
+ for rec in fileSelFieldsRecGen(dirPath, columns, delim):
965
+ mat.append(asIntList(rec))
966
+ return mat
967
+
968
+ def getFileAsFloatMatrix(dirPath, columns, delim=","):
969
+ """
970
+ extracts float matrix from csv file given column indices with each row being concatenation of
971
+ extracted column values row size = num of columns
972
+
973
+ Parameters
974
+ dirPath : file path
975
+ columns : indexes of columns
976
+ delim : delimiter
977
+ """
978
+ mat = list()
979
+ for rec in fileSelFieldsRecGen(dirPath, columns, delim):
980
+ mat.append(asFloatList(rec))
981
+ return mat
982
+
983
+ def getFileAsFloatColumn(dirPath):
984
+ """
985
+ get float list from a file with one float per row
986
+
987
+ Parameters
988
+ dirPath : file path
989
+ """
990
+ flist = list()
991
+ for rec in fileRecGen(dirPath, None):
992
+ flist.append(float(rec))
993
+ return flist
994
+
995
+ def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=","):
996
+ """
997
+ extracts float matrix from csv file given row filter and column indices with each row being
998
+ concatenation of extracted column values row size = num of columns
999
+
1000
+ Parameters
1001
+ dirPath : file path
1002
+ columns : indexes of columns
1003
+ filt : row filter lambda
1004
+ delim : delimiter
1005
+
1006
+ """
1007
+ mat = list()
1008
+ for rec in fileFiltSelFieldsRecGen(dirPath, filt, columns, delim):
1009
+ mat.append(asFloatList(rec))
1010
+ return mat
1011
+
1012
+ def getFileAsTypedRecords(dirPath, types, delim=","):
1013
+ """
1014
+ extracts typed records from csv file with each row being concatenation of
1015
+ extracted column values
1016
+
1017
+ Parameters
1018
+ dirPath : file path
1019
+ types : data types
1020
+ delim : delimiter
1021
+ """
1022
+ (dtypes, cvalues) = extractTypesFromString(types)
1023
+ tdata = list()
1024
+ for rec in fileRecGen(dirPath, delim):
1025
+ trec = list()
1026
+ for index, value in enumerate(rec):
1027
+ value = __convToTyped(index, value, dtypes)
1028
+ trec.append(value)
1029
+ tdata.append(trec)
1030
+ return tdata
1031
+
1032
+
1033
+ def getFileColsAsTypedRecords(dirPath, columns, types, delim=","):
1034
+ """
1035
+ extracts typed records from csv file given column indices with each row being concatenation of
1036
+ extracted column values
1037
+
1038
+ Parameters
1040
+ dirPath : file path
1041
+ columns : column indexes
1042
+ types : data types
1043
+ delim : delimiter
1044
+ """
1045
+ (dtypes, cvalues) = extractTypesFromString(types)
1046
+ tdata = list()
1047
+ for rec in fileSelFieldsRecGen(dirPath, columns, delim):
1048
+ trec = list()
1049
+ for indx, value in enumerate(rec):
1050
+ tindx = columns[indx]
1051
+ value = __convToTyped(tindx, value, dtypes)
1052
+ trec.append(value)
1053
+ tdata.append(trec)
1054
+ return tdata
1055
+
1056
+ def getFileColumnsMinMax(dirPath, columns, dtype, delim=","):
1057
+ """
1058
+ extracts numeric matrix from csv file given column indices. For each column return min and max
1059
+
1060
+ Parameters
1061
+ dirPath : file path
1062
+ columns : column indexes
1063
+ dtype : data type
1064
+ delim : delimiter
1065
+ """
1066
+ dtypes = list(map(lambda c : str(c) + ":" + dtype, columns))
1067
+ dtypes = ",".join(dtypes)
1068
+ #print(dtypes)
1069
+
1070
+ tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)
1071
+ minMax = list()
1072
+ ncola = len(tdata[0])
1073
+ ncole = len(columns)
1074
+ assertEqual(ncola, ncole, "actual no of columns different from expected")
1075
+
1076
+ for ci in range(ncole):
1077
+ vmin = sys.float_info.max
1078
+ vmax = -sys.float_info.max
1079
+ for r in tdata:
1080
+ cv = r[ci]
1081
+ vmin = cv if cv < vmin else vmin
1082
+ vmax = cv if cv > vmax else vmax
1083
+ mm = (vmin, vmax, vmax - vmin)
1084
+ minMax.append(mm)
1085
+
1086
+ return minMax
1087
+
1088
+
1089
+ def getRecAsTypedRecord(rec, types, delim=None):
1090
+ """
1091
+ converts record to typed records
1092
+
1093
+ Parameters
1094
+ rec : delimiter separated string or list of strings
1095
+ types : field data types
1096
+ delim : delimiter
1097
+ """
1098
+ if delim is not None:
1099
+ rec = rec.split(delim)
1100
+ (dtypes, cvalues) = extractTypesFromString(types)
1101
+ #print(types)
1102
+ #print(dtypes)
1103
+ trec = list()
1104
+ for ind, value in enumerate(rec):
1105
+ tvalue = __convToTyped(ind, value, dtypes)
1106
+ trec.append(tvalue)
1107
+ return trec
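+ # example (comment only):
+ # getRecAsTypedRecord("2,3.5,x", "0:int,1:float,2:string", ",") -> [2, 3.5, "x"]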
1108
+
1109
+ def __convToTyped(index, value, dtypes):
1110
+ """
1111
+ convert to typed value
1112
+
1113
+ Parameters
1114
+ index : index in type list
1115
+ value : data value
1116
+ dtypes : data type list
1117
+ """
1118
+ #print(index, value)
1119
+ dtype = dtypes[index]
1120
+ tvalue = value
1121
+ if dtype == "int":
1122
+ tvalue = int(value)
1123
+ elif dtype == "float":
1124
+ tvalue = float(value)
1125
+ return tvalue
1126
+
1127
+
1128
+
1129
+ def extractTypesFromString(types):
1130
+ """
1131
+ extracts column data types and set values for categorical variables
1132
+
1133
+ Parameters
1134
+ types : encoded type information
1135
+ """
1136
+ ftypes = types.split(",")
1137
+ dtypes = dict()
1138
+ cvalues = dict()
1139
+ for ftype in ftypes:
1140
+ items = ftype.split(":")
1141
+ cindex = int(items[0])
1142
+ dtype = items[1]
1143
+ dtypes[cindex] = dtype
1144
+ if len(items) == 3:
1145
+ sitems = items[2].split()
1146
+ cvalues[cindex] = sitems
1147
+ return (dtypes, cvalues)
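+ # example (comment only): extractTypesFromString("0:int,1:float,2:cat:a b c")
+ # -> ({0: "int", 1: "float", 2: "cat"}, {2: ["a", "b", "c"]})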
1148
+
1149
+ def getMultipleFileAsInttMatrix(dirPathWithCol, delim=","):
1150
+ """
1151
+ extracts int matrix from csv files given column index for each file.
1152
+ num of columns = number of rows in each file and num of rows = number of files
1153
+
1154
+ Parameters
1155
+ dirPathWithCol: list of file path and column index pair
1156
+ delim : delimiter
1157
+ """
1158
+ mat = list()
1159
+ minLen = -1
1160
+ for path, col in dirPathWithCol:
1161
+ colVals = getFileColumnAsInt(path, col, delim)
1162
+ if minLen < 0 or len(colVals) < minLen:
1163
+ minLen = len(colVals)
1164
+ mat.append(colVals)
1165
+
1166
+ #make all same length
1167
+ mat = list(map(lambda li:li[:minLen], mat))
1168
+ return mat
1169
+
1170
+ def getMultipleFileAsFloatMatrix(dirPathWithCol, delim=","):
1171
+ """
1172
+ extracts float matrix from csv files given column index for each file.
1173
+ num of columns = number of rows in each file and num of rows = number of files
1174
+
1175
+ Parameters
1176
+ dirPathWithCol: list of file path and column index pair
1177
+ delim : delimiter
1178
+ """
1179
+ mat = list()
1180
+ minLen = -1
1181
+ for path, col in dirPathWithCol:
1182
+ colVals = getFileColumnAsFloat(path, col, delim)
1183
+ if minLen < 0 or len(colVals) < minLen:
1184
+ minLen = len(colVals)
1185
+ mat.append(colVals)
1186
+
1187
+ #make all same length
1188
+ mat = list(map(lambda li:li[:minLen], mat))
1189
+ return mat
1190
+
1191
+ def writeStrListToFile(ldata, filePath, delem=","):
1192
+ """
1193
+ writes a list of delem separated strings, or a list of lists of strings, to a file
1194
+
1195
+ Parameters
1196
+ ldata : list data
1197
+ filePath : file path
1198
+ delem : delimiter
1199
+ """
1200
+ with open(filePath, "w") as fh:
1201
+ for r in ldata:
1202
+ if type(r) == list:
1203
+ r = delem.join(r)
1204
+ fh.write(r + "\n")
1205
+
1206
+ def writeFloatListToFile(ldata, prec, filePath):
1207
+ """
1208
+ writes float list to file, one value per line
1209
+
1210
+ Parameters
1211
+ ldata : list data
1212
+ prec : precision
1213
+ filePath : file path
1214
+ """
1215
+ with open(filePath, "w") as fh:
1216
+ for d in ldata:
1217
+ fh.write(formatFloat(prec, d) + "\n")
1218
+
1219
+ def mutateFileLines(dirPath, mutator, marg, delim=","):
1220
+ """
1221
+ mutates lines from a file
1222
+
1223
+ Parameters
1224
+ dirPath : file path
1225
+ mutator : mutation callback
1226
+ marg : argument for mutation call back
1227
+ delim : delimiter
1228
+ """
1229
+ lines = list()
1230
+ for li in fileRecGen(dirPath, delim):
1231
+ li = mutator(li) if marg is None else mutator(li, marg)
1232
+ lines.append(li)
1233
+ return lines
1234
+
1235
+ def takeFirst(elems):
1236
+ """
1237
+ return first item
1238
+
1239
+ Parameters
1240
+ elems : list of data
1241
+ """
1242
+ return elems[0]
1243
+
1244
+ def takeSecond(elems):
1245
+ """
1246
+ return 2nd element
1247
+
1248
+ Parameters
1249
+ elems : list of data
1250
+ """
1251
+ return elems[1]
1252
+
1253
+ def takeThird(elems):
1254
+ """
1255
+ returns 3rd element
1256
+
1257
+ Parameters
1258
+ elems : list of data
1259
+ """
1260
+ return elems[2]
1261
+
1262
+ def addToKeyedCounter(dCounter, key, count=1):
1263
+ """
1264
+ add to to keyed counter
1265
+
1266
+ Parameters
1267
+ dCounter : dictionary of counters
1268
+ key : dictionary key
1269
+ count : count to add
1270
+ """
1271
+ curCount = dCounter.get(key, 0)
1272
+ dCounter[key] = curCount + count
1273
+
1274
+ def incrKeyedCounter(dCounter, key):
1275
+ """
1276
+ increment keyed counter
1277
+
1278
+ Parameters
1279
+ dCounter : dictionary of counters
1280
+ key : dictionary key
1281
+ """
1282
+ addToKeyedCounter(dCounter, key, 1)
1283
+
1284
+ def appendKeyedList(dList, key, elem):
1285
+ """
1286
+ keyed list
1287
+
1288
+ Parameters
1289
+ dList : dictionary of lists
1290
+ key : dictionary key
1291
+ elem : value to append
1292
+ """
1293
+ curList = dList.get(key, [])
1294
+ curList.append(elem)
1295
+ dList[key] = curList
1296
+
1297
+ def isNumber(st):
1298
+ """
1299
+ Returns True if string is a number
1300
+
1301
+ Parameters
1302
+ st : string value
1303
+ """
1304
+ return st.replace('.','',1).isdigit()
1305
+
1306
+ def removeNan(values):
1307
+ """
1308
+ removes nan from list
1309
+
1310
+ Parameters
1311
+ values : list data
1312
+ """
1313
+ return list(filter(lambda v: not math.isnan(v), values))
1314
+
1315
+ def fileRecGen(filePath, delim = ","):
1316
+ """
1317
+ file record generator
1318
+
1319
+ Parameters
1320
+ filePath : file path
1321
+ delim : delimiter
1322
+ """
1323
+ with open(filePath, "r") as fp:
1324
+ for line in fp:
1325
+ line = line[:-1]
1326
+ if delim is not None:
1327
+ line = line.split(delim)
1328
+ yield line
1329
+
1330
+ def fileSelFieldsRecGen(dirPath, columns, delim=","):
1331
+ """
1332
+ file record generator given column indices
1333
+
1334
+ Parameters
1335
+ filePath : file path
1336
+ columns : column indexes as int array or comma separated string
1337
+ delim : delimiter
1338
+ """
1339
+ if type(columns) == str:
1340
+ columns = strToIntArray(columns, delim)
1341
+ for rec in fileRecGen(dirPath, delim):
1342
+ extracted = extractList(rec, columns)
1343
+ yield extracted
1344
+
1345
+ def fileSelFieldValueGen(dirPath, column, delim=","):
1346
+ """
1347
+ file record generator for a given column
1348
+
1349
+ Parameters
1350
+ filePath : file path
1351
+ column : column index
1352
+ delim : delimiter
1353
+ """
1354
+ for rec in fileRecGen(dirPath, delim):
1355
+ yield rec[column]
1356
+
1357
+ def fileFiltRecGen(filePath, filt, delim = ","):
1358
+ """
1359
+ file record generator with row filter applied
1360
+
1361
+ Parameters
1362
+ filePath : file path
1363
+ filt : row filter
1364
+ delim : delimiter
1365
+ """
1366
+ with open(filePath, "r") as fp:
1367
+ for line in fp:
1368
+ line = line[:-1]
1369
+ if delim is not None:
1370
+ line = line.split(delim)
1371
+ if filt(line):
1372
+ yield line
1373
+
1374
+ def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = ","):
1375
+ """
1376
+ file record generator with row and column filter applied
1377
+
1378
+ Parameters
1379
+ filePath : file path
1380
+ filt : row filter
1381
+ columns : column indexes as int array or comma separated string
1382
+ delim : delimiter
1383
+ """
1384
+ if type(columns) == str:
+ 	columns = strToIntArray(columns, delim)
1385
+ with open(filePath, "r") as fp:
1386
+ for line in fp:
1387
+ line = line[:-1]
1388
+ if delim is not None:
1389
+ line = line.split(delim)
1390
+ if filt(line):
1391
+ selected = extractList(line, columns)
1392
+ yield selected
1393
+
1394
+ def fileTypedRecGen(filePath, ftypes, delim = ","):
1395
+ """
1396
+ file typed record generator
1397
+
1398
+ Parameters
1399
+ filePath : file path
1400
+ ftypes : list of field types
1401
+ delim : delimiter
1402
+ """
1403
+ with open(filePath, "r") as fp:
1404
+ for line in fp:
1405
+ line = line[:-1]
1406
+ line = line.split(delim)
1407
+ for i in range(0, len(ftypes), 2):
1408
+ ci = ftypes[i]
1409
+ dtype = ftypes[i+1]
1410
+ assertLesser(ci, len(line), "index out of bound")
1411
+ if dtype == "int":
1412
+ line[ci] = int(line[ci])
1413
+ elif dtype == "float":
1414
+ line[ci] = float(line[ci])
1415
+ else:
1416
+ exitWithMsg("invalid data type")
1417
+ yield line
1418
+
1419
+ def fileMutatedFieldsRecGen(dirPath, mutator, delim=","):
1420
+ """
1421
+ file record generator with some columns mutated
1422
+
1423
+ Parameters
1424
+ dirPath : file path
1424
+ mutator : row field mutator
1425
+ delim : delimiter
1427
+ """
1428
+ for rec in fileRecGen(dirPath, delim):
1429
+ mutated = mutator(rec)
1430
+ yield mutated
1431
+
1432
+ def tableSelFieldsFilter(tdata, columns):
1433
+ """
1434
+ gets tabular data for selected columns
1435
+
1436
+ Parameters
1437
+ tdata : tabular data
1438
+ columns : column indexes
1439
+ """
1440
+ if areAllFieldsIncluded(tdata[0], columns):
1441
+ ntdata = tdata
1442
+ else:
1443
+ ntdata = list()
1444
+ for rec in tdata:
1445
+ #print(rec)
1446
+ #print(columns)
1447
+ nrec = extractList(rec, columns)
1448
+ ntdata.append(nrec)
1449
+ return ntdata
1450
+
1451
+
1452
+ def areAllFieldsIncluded(ldata, columns):
1453
+ """
1454
+ return True if all indexes are in the columns
1455
+
1456
+ Parameters
1457
+ ldata : list data
1458
+ columns : column indexes
1459
+ """
1460
+ return list(range(len(ldata))) == columns
1461
+
1462
+ def asIntList(items):
1463
+ """
1464
+ returns int list
1465
+
1466
+ Parameters
1467
+ items : list data
1468
+ """
1469
+ return [int(i) for i in items]
1470
+
1471
+ def asFloatList(items):
1472
+ """
1473
+ returns float list
1474
+
1475
+ Parameters
1476
+ items : list data
1477
+ """
1478
+ return [float(i) for i in items]
1479
+
1480
+ def pastTime(interval, unit):
1481
+ """
1482
+ current and past time
1483
+
1484
+ Parameters
1485
+ interval : time interval
1486
+ unit: time unit
1487
+ """
1488
+ curTime = int(time.time())
1489
+ if unit == "d":
1490
+ pastTime = curTime - interval * secInDay
1491
+ elif unit == "h":
1492
+ pastTime = curTime - interval * secInHour
1493
+ elif unit == "m":
1494
+ pastTime = curTime - interval * secInMinute
1495
+ else:
1496
+ raise ValueError("invalid time unit " + unit)
1497
+ return (curTime, pastTime)
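+ # example (comment only): pastTime(2, "d") -> (now, now - 2 * secInDay)
+ # where now is the current epoch time in sec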
1498
+
1499
+ def minuteAlign(ts):
1500
+ """
1501
+ minute aligned time
1502
+
1503
+ Parameters
1504
+ ts : time stamp in sec
1505
+ """
1506
+ return int((ts / secInMinute)) * secInMinute
1507
+
1508
+ def multMinuteAlign(ts, min):
1509
+ """
1510
+ multi minute aligned time
1511
+
1512
+ Parameters
1513
+ ts : time stamp in sec
1514
+ min : minute value
1515
+ """
1516
+ intv = secInMinute * min
1517
+ return int((ts / intv)) * intv
1518
+
1519
+ def hourAlign(ts):
1520
+ """
1521
+ hour aligned time
1522
+
1523
+ Parameters
1524
+ ts : time stamp in sec
1525
+ """
1526
+ return int((ts / secInHour)) * secInHour
1527
+
1528
+ def hourOfDayAlign(ts, hour):
1529
+ """
1530
+ hour of day aligned time
1531
+
1532
+ Parameters
1533
+ ts : time stamp in sec
1534
+ hour : hour of day
1535
+ """
1536
+ day = int(ts / secInDay)
1537
+ return (24 * day + hour) * secInHour
1538
+
1539
+ def dayAlign(ts):
1540
+ """
1541
+ day aligned time
1542
+
1543
+ Parameters
1544
+ ts : time stamp in sec
1545
+ """
1546
+ return int(ts / secInDay) * secInDay
1547
+
1548
+ def timeAlign(ts, unit):
1549
+ """
1550
+ boundary alignment of time
1551
+
1552
+ Parameters
1553
+ ts : time stamp in sec
1554
+ unit : unit of time
1555
+ """
1556
+ alignedTs = 0
1557
+ if unit == "s":
1558
+ alignedTs = ts
1559
+ elif unit == "m":
1560
+ alignedTs = minuteAlign(ts)
1561
+ elif unit == "h":
1562
+ alignedTs = hourAlign(ts)
1563
+ elif unit == "d":
1564
+ alignedTs = dayAlign(ts)
1565
+ else:
1566
+ raise ValueError("invalid time unit")
1567
+ return alignedTs
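+ # examples (comment only): timeAlign(3661, "m") -> 3660, timeAlign(3661, "h") -> 3600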
1568
+
1569
+ def monthOfYear(ts):
1570
+ """
1571
+ month of year
1572
+
1573
+ Parameters
1574
+ ts : time stamp in sec
1575
+ """
1576
+ rem = ts % secInYear
1577
+ mon = int(rem / secInMonth)
1578
+ return mon
1579
+
1580
+ def dayOfWeek(ts):
1581
+ """
1582
+ day of week
1583
+
1584
+ Parameters
1585
+ ts : time stamp in sec
1586
+ """
1587
+ rem = ts % secInWeek
1588
+ dow = int(rem / secInDay)
1589
+ return dow
1590
+
1591
+ def hourOfDay(ts):
1592
+ """
1593
+ hour of day
1594
+
1595
+ Parameters
1596
+ ts : time stamp in sec
1597
+ """
1598
+ rem = ts % secInDay
1599
+ hod = int(rem / secInHour)
1600
+ return hod
1601
+
1602
+ def processCmdLineArgs(expectedTypes, usage):
1603
+ """
1604
+ process command line args and returns args as typed values
1605
+
1606
+ Parameters
1607
+ expectedTypes : expected data types of arguments
1608
+ usage : usage message string
1609
+ """
1610
+ args = []
1611
+ numComLineArgs = len(sys.argv)
1612
+ numExpected = len(expectedTypes)
1613
+ if (numComLineArgs - 1 == len(expectedTypes)):
1614
+ try:
1615
+ for i in range(0, numExpected):
1616
+ if (expectedTypes[i] == typeInt):
1617
+ args.append(int(sys.argv[i+1]))
1618
+ elif (expectedTypes[i] == typeFloat):
1619
+ args.append(float(sys.argv[i+1]))
1620
+ elif (expectedTypes[i] == typeString):
1621
+ args.append(sys.argv[i+1])
1622
+ except ValueError:
1623
+ print ("expected number of command line arguments found but there is type mis match")
1624
+ sys.exit(1)
1625
+ else:
1626
+ print ("expected number of command line arguments not found")
1627
+ print (usage)
1628
+ sys.exit(1)
1629
+ return args
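+ # usage sketch (comment only; the arg names are made up):
+ # args = processCmdLineArgs([typeString, typeInt], "usage: prog <name> <count>")
+ # name, count = args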
1630
+
1631
+ def mutateString(val, numMutate, ctype):
1632
+ """
1633
+ mutate string multiple times
1634
+
1635
+ Parameters
1636
+ val : string value
1637
+ numMutate : num of mutations
1638
+ ctype : type of character to mutate with
1639
+ """
1640
+ mutations = set()
1641
+ count = 0
1642
+ while count < numMutate:
1643
+ j = randint(0, len(val)-1)
1644
+ if j not in mutations:
1645
+ if ctype == "alpha":
1646
+ ch = selectRandomFromList(alphaTokens)
1647
+ elif ctype == "num":
1648
+ ch = selectRandomFromList(numTokens)
1649
+ elif ctype == "any":
1650
+ ch = selectRandomFromList(tokens)
1651
+ val = val[:j] + ch + val[j+1:]
1652
+ mutations.add(j)
1653
+ count += 1
1654
+ return val
1655
+
1656
+ def mutateList(values, numMutate, vmin, vmax, rabs=True):
1657
+ """
1658
+ mutate list multiple times
1659
+
1660
+ Parameters
1661
+ values : list value
1662
+ numMutate : num of mutations
1663
+ vmin : minimum of value range
1664
+ vmax : maximum of value range
1665
+ rabs : True if min max range is absolute otherwise relative
1666
+ """
1667
+ mutations = set()
1668
+ count = 0
1669
+ while count < numMutate:
1670
+ j = randint(0, len(values)-1)
1671
+ if j not in mutations:
1672
+ s = np.random.uniform(vmin, vmax)
1673
+ values[j] = s if rabs else values[j] * s
1674
+ count += 1
1675
+ mutations.add(j)
1676
+ return values
1677
+
1678
+
1679
+ def swap(values, first, second):
1680
+ """
1681
+ swap two elements
1682
+
1683
+ Parameters
1684
+ values : list value
1685
+ first : first swap position
1686
+ second : second swap position
1687
+ """
1688
+ t = values[first]
1689
+ values[first] = values[second]
1690
+ values[second] = t
1691
+
1692
+ def swapBetweenLists(values1, values2):
1693
+ """
1694
+ swap two elements between 2 lists
1695
+
1696
+ Parameters
1697
+ values1 : first list of values
1698
+ values2 : second list of values
1699
+ """
1700
+ p1 = randint(0, len(values1)-1)
1701
+ p2 = randint(0, len(values2)-1)
1702
+ tmp = values1[p1]
1703
+ values1[p1] = values2[p2]
1704
+ values2[p2] = tmp
1705
+
1706
+ def safeAppend(values, value):
1707
+ """
1708
+ append only if not None
1709
+
1710
+ Parameters
1711
+ values : list value
1712
+ value : value to append
1713
+ """
1714
+ if value is not None:
1715
+ values.append(value)
1716
+
1717
+ def getAllIndex(ldata, fldata):
1718
+ """
1719
+ get ALL indexes of list elements
1720
+
1721
+ Parameters
1722
+ ldata : list data to find index in
1723
+ fldata : list data for values for index look up
1724
+ """
1725
+ return list(map(lambda e : fldata.index(e), ldata))
1726
+
1727
+ def findIntersection(lOne, lTwo):
1728
+ """
1729
+ find intersection elements between 2 lists
1730
+
1731
+ Parameters
1732
+ lOne : first list of data
1733
+ lTwo : second list of data
1734
+ """
1735
+ sOne = set(lOne)
1736
+ sTwo = set(lTwo)
1737
+ sInt = sOne.intersection(sTwo)
1738
+ return list(sInt)
1739
+
1740
+ def isIntvOverlapped(rOne, rTwo):
1741
+ """
1742
+ checks overlap between 2 intervals
1743
+
1744
+ Parameters
1745
+ rOne : first interval boundaries
1746
+ rTwo : second interval boundaries
1747
+ """
1748
+ clear = rOne[1] <= rTwo[0] or rOne[0] >= rTwo[1]
1749
+ return not clear
1750
+
1751
+ def isIntvLess(rOne, rTwo):
1752
+ """
1753
+ checks if first interval is less than second
1754
+
1755
+ Parameters
1756
+ rOne : first interval boundaries
1757
+ rTwo : second interval boundaries
1758
+ """
1759
+ less = rOne[1] <= rTwo[0]
1760
+ return less
1761
+
1762
+ def findRank(e, values):
1763
+ """
1764
+ find rank of value in a list
1765
+
1766
+ Parameters
1767
+ e : value to compare with
1768
+ values : list data
1769
+ """
1770
+ count = 1
1771
+ for ve in values:
1772
+ if ve < e:
1773
+ count += 1
1774
+ return count
1775
+
1776
+ def findRanks(toBeRanked, values):
1777
+ """
1778
+ find ranks of values in one list in another list
1779
+
1780
+ Parameters
1781
+ toBeRanked : list of values for which ranks are found
1782
+ values : list in which rank is found
1783
+ """
1784
+ return list(map(lambda e: findRank(e, values), toBeRanked))
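+ # examples (comment only): findRank(3, [1, 2, 5]) -> 3 (two smaller values,
+ # rank counting starts at 1); findRanks([3, 0], [1, 2, 5]) -> [3, 1]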
1785
+
1786
+ def formatFloat(prec, value, label = None):
1787
+ """
1788
+ formats a float with optional label
1789
+
1790
+ Parameters
1791
+ prec : precision
1792
+ value : data value
1793
+ label : label for data
1794
+ """
1795
+ st = (label + " ") if label else ""
1796
+ formatter = "{:." + str(prec) + "f}"
1797
+ return st + formatter.format(value)
1798
+
1799
+ def formatAny(value, label = None):
1800
+ """
1801
+ formats any object with optional label
1802
+
1803
+ Parameters
1804
+ value : data value
1805
+ label : label for data
1806
+ """
1807
+ st = (label + " ") if label else ""
1808
+ return st + str(value)
1809
+
1810
+ def printList(values):
1811
+ """
1812
+ pretty print list
1813
+
1814
+ Parameters
1815
+ values : list of values
1816
+ """
1817
+ for v in values:
1818
+ print(v)
1819
+
1820
+ def printMap(values, klab, vlab, precision, offset=16):
1821
+ """
1822
+ pretty print hash map
1823
+
1824
+ Parameters
1825
+ values : dictionary of values
1826
+ klab : label for key
1827
+ vlab : label for value
1828
+ precision : precision
1829
+ offset : left justify offset
1830
+ """
1831
+ print(klab.ljust(offset, " ") + vlab)
1832
+ for k in values.keys():
1833
+ v = values[k]
1834
+ ks = toStr(k, precision).ljust(offset, " ")
1835
+ vs = toStr(v, precision)
1836
+ print(ks + vs)
1837
+
1838
+ def printPairList(values, lab1, lab2, precision, offset=16):
1839
+ """
1840
+ pretty print list of pairs
1841
+
1842
+ Parameters
1843
+ values : list of pairs
1844
+ lab1 : first label
1845
+ lab2 : second label
1846
+ precision : precision
1847
+ offset : left justify offset
1848
+ """
1849
+ print(lab1.ljust(offset, " ") + lab2)
1850
+ for (v1, v2) in values:
1851
+ sv1 = toStr(v1, precision).ljust(offset, " ")
1852
+ sv2 = toStr(v2, precision)
1853
+ print(sv1 + sv2)
1854
+
1855
+ def createMap(*values):
1856
+ """
1857
+ creates dictionary from key value pairs
1858
+
1859
+ Parameters
1860
+ values : sequence of key value pairs
1861
+ """
1862
+ result = dict()
1863
+ for i in range(0, len(values), 2):
1864
+ result[values[i]] = values[i+1]
1865
+ return result
1866
+
1867
+ def getColMinMax(table, col):
1868
+ """
1869
+ return min, max values of a column
1870
+
1871
+ Parameters
1872
+ table : tabular data
1873
+ col : column index
1874
+ """
1875
+ vmin = None
1876
+ vmax = None
1877
+ for rec in table:
1878
+ value = rec[col]
1879
+ if vmin is None:
1880
+ vmin = value
1881
+ vmax = value
1882
+ else:
1883
+ if value < vmin:
1884
+ vmin = value
1885
+ elif value > vmax:
1886
+ vmax = value
1887
+ return (vmin, vmax, vmax - vmin)
1888
+
1889
+ def createLogger(name, logFilePath, logLevName):
1890
+ """
1891
+ creates logger
1892
+
1893
+ Parameters
1894
+ name : logger name
1895
+ logFilePath : log file path
1896
+ logLevName : log level
1897
+ """
1898
+ logger = logging.getLogger(name)
1899
+ fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4)
1900
+ logLev = logLevName.lower()
1901
+ if logLev == "debug":
1902
+ logLevel = logging.DEBUG
1903
+ elif logLev == "info":
1904
+ logLevel = logging.INFO
1905
+ elif logLev == "warning":
1906
+ logLevel = logging.WARNING
1907
+ elif logLev == "error":
1908
+ logLevel = logging.ERROR
1909
+ elif logLev == "critical":
1910
+ logLevel = logging.CRITICAL
1911
+ else:
1912
+ raise ValueError("invalid log level name " + logLevelName)
1913
+ fHandler.setLevel(logLevel)
1914
+ fFormat = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
1915
+ fHandler.setFormatter(fFormat)
1916
+ logger.addHandler(fHandler)
1917
+ logger.setLevel(logLevel)
1918
+ return logger
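+ # usage sketch (comment only; the name and path are made up):
+ # logger = createLogger("myapp", "myapp.log", "info")
+ # logger.info("started")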
1919
+
1920
+ @contextmanager
1921
+ def suppressStdout():
1922
+ """
1923
+ suppress stdout
1924
+
1925
+
1928
+ with open(os.devnull, "w") as devnull:
1929
+ oldStdout = sys.stdout
1930
+ sys.stdout = devnull
1931
+ try:
1932
+ yield
1933
+ finally:
1934
+ sys.stdout = oldStdout
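+ # usage sketch (comment only):
+ # with suppressStdout():
+ # 	print("this goes to devnull")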
1935
+
1936
+ def exitWithMsg(msg):
1937
+ """
1938
+ print message and exit
1939
+
1940
+ Parameters
1941
+ msg : message
1942
+ """
1943
+ print(msg + " -- quitting")
1944
+ sys.exit(0)
1945
+
1946
+ def drawLine(data, yscale=None):
1947
+ """
1948
+ line plot
1949
+
1950
+ Parameters
1951
+ data : list data
1952
+ yscale : y axis scale
1953
+ """
1954
+ plt.plot(data)
1955
+ if yscale:
1956
+ step = int(yscale / 10)
1957
+ step = int(step / 10) * 10
1958
+ plt.yticks(range(0, yscale, step))
1959
+ plt.show()
1960
+
1961
+ def drawPlot(x, y, xlabel, ylabel):
1962
+ """
1963
+ line plot
1964
+
1965
+ Parameters
1966
+ x : x values
1967
+ y : y values
1968
+ xlabel : x axis label
1969
+ ylabel : y axis label
1970
+ """
1971
+ if x is None:
1972
+ x = list(range(len(y)))
1973
+ plt.plot(x,y)
1974
+ plt.xlabel(xlabel)
1975
+ plt.ylabel(ylabel)
1976
+ plt.show()
1977
+
1978
+ def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label):
1979
+ """
1980
+ line plot of 2 lines
1981
+
1982
+ Parameters
1983
+ x : x values
1984
+ y1 : first y values
1985
+ y2 : second y values
1986
+ xlabel : x label
1987
+ ylabel : y label
1988
+ y1label : first plot label
1989
+ y2label : second plot label
1990
+ """
1991
+ plt.plot(x, y1, label = y1label)
1992
+ plt.plot(x, y2, label = y2label)
1993
+ plt.xlabel(xlabel)
1994
+ plt.ylabel(ylabel)
1995
+ plt.legend()
1996
+ plt.show()
1997
+
1998
+ def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10):
1999
+ """
2000
+ draw histogram
2001
+
2002
+ Parameters
2003
+ ldata : list data
2004
+ myTitle : title
2005
+ myXlabel : x label
2006
+ myYlabel : y label
2007
+ nbins : num of bins
2008
+ """
2009
+ plt.hist(ldata, bins=nbins, density=True)
2010
+ plt.title(myTitle)
2011
+ plt.xlabel(myXlabel)
2012
+ plt.ylabel(myYlabel)
2013
+ plt.show()
2014
+
2015
+ def saveObject(obj, filePath):
2016
+ """
2017
+ saves an object
2018
+
2019
+ Parameters
2020
+ obj : object
2021
+ filePath : file path for saved object
2022
+ """
2023
+ with open(filePath, "wb") as outfile:
2024
+ pickle.dump(obj,outfile)
2025
+ 
+ def restoreObject(filePath):
+ 	"""
+ 	restores an object
+ 
+ 	Parameters
+ 	filePath : file path to restore object from
+ 	"""
+ 	with open(filePath, "rb") as infile:
+ 		obj = pickle.load(infile)
+ 	return obj
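+ #illustrative pickle round trip (editor's sketch; the path is hypothetical)
+ #saveObject({"a" : 1}, "./obj.sav")
+ #obj = restoreObject("./obj.sav")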
+ 
+ def isNumeric(data):
+ 	"""
+ 	true if all elements int or float
+ 
+ 	Parameters
+ 	data : numeric data list
+ 	"""
+ 	if type(data) == list or type(data) == np.ndarray:
+ 		col = pd.Series(data)
+ 	else:
+ 		col = data
+ 	return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64
+ 
+ def isInteger(data):
+ 	"""
+ 	true if all elements int
+ 
+ 	Parameters
+ 	data : numeric data list
+ 	"""
+ 	if type(data) == list or type(data) == np.ndarray:
+ 		col = pd.Series(data)
+ 	else:
+ 		col = data
+ 	return col.dtype == np.int32 or col.dtype == np.int64
+ 
+ def isFloat(data):
+ 	"""
+ 	true if all elements float
+ 
+ 	Parameters
+ 	data : numeric data list
+ 	"""
+ 	if type(data) == list or type(data) == np.ndarray:
+ 		col = pd.Series(data)
+ 	else:
+ 		col = data
+ 	return col.dtype == np.float32 or col.dtype == np.float64
+ 
+ def isBinary(data):
+ 	"""
+ 	true if all elements either 0 or 1
+ 
+ 	Parameters
+ 	data : binary data
+ 	"""
+ 	re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None)
+ 	return (re is None)
+ 
+ def isCategorical(data):
+ 	"""
+ 	true if all elements int or string
+ 
+ 	Parameters
+ 	data : data value
+ 	"""
+ 	re = next((d for d in data if not (type(d) == int or type(d) == str)), None)
+ 	return (re is None)
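+ #illustrative checks (editor's sketch)
+ #isNumeric([1, 2.5, 3]) is True, isBinary([0, 1, 1, 0]) is True,
+ #isCategorical([1, "red"]) is True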
+ 
+ def assertEqual(value, veq, msg):
+ 	"""
+ 	assert equal to
+ 
+ 	Parameters
+ 	value : value
+ 	veq : value to be equated with
+ 	msg : error msg
+ 	"""
+ 	assert value == veq, msg
+ 
+ def assertGreater(value, vmin, msg):
+ 	"""
+ 	assert greater than
+ 
+ 	Parameters
+ 	value : value
+ 	vmin : minimum value
+ 	msg : error msg
+ 	"""
+ 	assert value > vmin, msg
+ 
+ def assertGreaterEqual(value, vmin, msg):
+ 	"""
+ 	assert greater than or equal to
+ 
+ 	Parameters
+ 	value : value
+ 	vmin : minimum value
+ 	msg : error msg
+ 	"""
+ 	assert value >= vmin, msg
+ 
+ def assertLesser(value, vmax, msg):
+ 	"""
+ 	assert less than
+ 
+ 	Parameters
+ 	value : value
+ 	vmax : maximum value
+ 	msg : error msg
+ 	"""
+ 	assert value < vmax, msg
+ 
+ def assertLesserEqual(value, vmax, msg):
+ 	"""
+ 	assert less than or equal to
+ 
+ 	Parameters
+ 	value : value
+ 	vmax : maximum value
+ 	msg : error msg
+ 	"""
+ 	assert value <= vmax, msg
+ 
+ def assertWithinRange(value, vmin, vmax, msg):
+ 	"""
+ 	assert within range
+ 
+ 	Parameters
+ 	value : value
+ 	vmin : minimum value
+ 	vmax : maximum value
+ 	msg : error msg
+ 	"""
+ 	assert vmin <= value <= vmax, msg
+ 
+ def assertInList(value, values, msg):
+ 	"""
+ 	assert membership in a list
+ 
+ 	Parameters
+ 	value : value to check for inclusion
+ 	values : list data
+ 	msg : error msg
+ 	"""
+ 	assert value in values, msg
+ 
+ def maxListDist(l1, l2):
+ 	"""
+ 	maximum list element difference between 2 lists
+ 
+ 	Parameters
+ 	l1 : first list data
+ 	l2 : second list data
+ 	"""
+ 	dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2))))
+ 	return dist
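+ #illustrative call (editor's sketch): element wise differences are 1, 2, 0
+ #maxListDist([1, 5, 9], [2, 3, 9]) returns 2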
+ 
+ def fileLineCount(fPath):
+ 	"""
+ 	number of lines in a file
+ 
+ 	Parameters
+ 	fPath : file path
+ 	"""
+ 	#i stays at -1 for an empty file, so the count is 0 instead of an unbound variable error
+ 	i = -1
+ 	with open(fPath) as f:
+ 		for i, li in enumerate(f):
+ 			pass
+ 	return (i + 1)
+ 
+ def getAlphaNumCharCount(sdata):
+ 	"""
+ 	number of alphabetic and numeric characters in a string
+ 
+ 	Parameters
+ 	sdata : string data
+ 	"""
+ 	acount = 0
+ 	ncount = 0
+ 	scount = 0
+ 	ocount = 0
+ 	assertEqual(type(sdata), str, "input must be string")
+ 	for c in sdata:
+ 		if c.isnumeric():
+ 			ncount += 1
+ 		elif c.isalpha():
+ 			acount += 1
+ 		elif c.isspace():
+ 			scount += 1
+ 		else:
+ 			ocount += 1
+ 	#returns (alpha, numeric, other); the whitespace count scount is tracked but not returned
+ 	r = (acount, ncount, ocount)
+ 	return r
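+ #illustrative call (editor's sketch): 4 letters, 1 digit, 1 other character
+ #getAlphaNumCharCount("ab3 cd!") returns (4, 1, 1)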
+ 
+ def genPowerSet(cvalues, incEmpty=False):
+ 	"""
+ 	generates power set i.e. all possible subsets
+ 
+ 	Parameters
+ 	cvalues : list of categorical values
+ 	incEmpty : include empty set if True
+ 	"""
+ 	ps = list()
+ 	for cv in cvalues:
+ 		pse = list()
+ 		for s in ps:
+ 			sc = s.copy()
+ 			sc.add(cv)
+ 			pse.append(sc)
+ 		ps.extend(pse)
+ 		es = set()
+ 		es.add(cv)
+ 		ps.append(es)
+ 
+ 	if incEmpty:
+ 		#an empty set; the literal {} would create an empty dict instead
+ 		ps.append(set())
+ 	return ps
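+ #illustrative call (editor's sketch)
+ #genPowerSet(["a", "b"]) returns [{"a"}, {"a", "b"}, {"b"}]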
+ 
+ class StepFunction:
+ 	"""
+ 	step function
+ 
+ 	Parameters
+ 
+ 	"""
+ 	def __init__(self, *values):
+ 		"""
+ 		initializer
+ 
+ 		Parameters
+ 		values : list of tuples, with each tuple containing 2 x values and the corresponding y value
+ 		"""
+ 		self.points = values
+ 
+ 	def find(self, x):
+ 		"""
+ 		finds step function value
+ 
+ 		Parameters
+ 		x : x value
+ 		"""
+ 		found = False
+ 		y = 0
+ 		for p in self.points:
+ 			if (x >= p[0] and x < p[1]):
+ 				y = p[2]
+ 				found = True
+ 				break
+ 
+ 		if not found:
+ 			l = len(self.points)
+ 			if (x < self.points[0][0]):
+ 				y = self.points[0][2]
+ 			#>= so that x exactly at the last upper boundary gets the last y value instead of 0
+ 			elif (x >= self.points[l-1][1]):
+ 				y = self.points[l-1][2]
+ 		return y
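+ #illustrative usage (editor's sketch): two steps, [0, 5) -> 1 and [5, 10) -> 2
+ #sf = StepFunction((0, 5, 1), (5, 10, 2))
+ #sf.find(7) returns 2, sf.find(-3) returns 1, sf.find(12) returns 2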
+ 
+ 
+ class DummyVarGenerator:
+ 	"""
+ 	dummy variable generator for categorical variable
+ 	"""
+ 	def __init__(self, rowSize, catValues, trueVal, falseVal, delim=None):
+ 		"""
+ 		initializer
+ 
+ 		Parameters
+ 		rowSize : row size
+ 		catValues : dictionary with field index as key and list of categorical values as value
+ 		trueVal : true value, typically "1"
+ 		falseVal : false value, typically "0"
+ 		delim : field delimiter
+ 		"""
+ 		self.rowSize = rowSize
+ 		self.catValues = catValues
+ 		numCatVar = len(catValues)
+ 		colCount = 0
+ 		for v in self.catValues.values():
+ 			colCount += len(v)
+ 		#each categorical column is replaced by one column per category value
+ 		self.newRowSize = rowSize - numCatVar + colCount
+ 		self.trueVal = trueVal
+ 		self.falseVal = falseVal
+ 		self.delim = delim
+ 
+ 	def processRow(self, row):
+ 		"""
+ 		encodes categorical variables, returning a delimiter separated string or a list
+ 
+ 		Parameters
+ 		row : row, either delimiter separated string or list
+ 		"""
+ 		if self.delim is not None:
+ 			rowArr = row.split(self.delim)
+ 			msg = "row does not have expected number of columns found " + str(len(rowArr)) + " expected " + str(self.rowSize)
+ 			assert len(rowArr) == self.rowSize, msg
+ 		else:
+ 			rowArr = row
+ 
+ 		newRowArr = []
+ 		for i in range(len(rowArr)):
+ 			curVal = rowArr[i]
+ 			if (i in self.catValues):
+ 				values = self.catValues[i]
+ 				for val in values:
+ 					if val == curVal:
+ 						newVal = self.trueVal
+ 					else:
+ 						newVal = self.falseVal
+ 					newRowArr.append(newVal)
+ 			else:
+ 				newRowArr.append(curVal)
+ 		assert len(newRowArr) == self.newRowSize, "invalid new row size " + str(len(newRowArr)) + " expected " + str(self.newRowSize)
+ 		encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr
+ 		return encRow
+ 
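+ #illustrative usage (editor's sketch): a 3 column row whose second field is
+ #categorical with values "red", "green", "blue"
+ #dvg = DummyVarGenerator(3, {1 : ["red", "green", "blue"]}, "1", "0", ",")
+ #dvg.processRow("5,green,7") returns "5,0,1,0,7"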