Priyanka-Kumavat-At-TE committed on
Commit
8c99283
1 Parent(s): 59ade4b

Delete matumizi

matumizi/LICENSE DELETED
@@ -1,202 +0,0 @@
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
matumizi/MANIFEST.in DELETED
File without changes
matumizi/README.md DELETED
@@ -1,98 +0,0 @@
- # matumizi
-
- Data Science utilities including the following modules
- * util : misc utility functions
- * mlutil : machine learning related utilities including a type aware configuration class
- * stats : various stats classes and functions
- * sampler : sampling from various statistical distributions
- * daexp : many data exploration functions consolidating numpy, scipy, statsmodels and scikit
- * mcsim : Monte Carlo simulation
-
- ## Instructions
-
- 1. Install:
-
- Run
-     pip3 install -i https://test.pypi.org/simple/ matumizi==0.0.7
-
- For installing the latest, clone the repo and run this at the project root directory
-     pip3 install .
-
-
- 2. Project page in testpypi
-
- https://test.pypi.org/project/matumizi/0.0.7/
-
-
- 3. Blog posts
-
- * [Data exploration module overview including usage examples](https://pkghosh.wordpress.com/2020/07/13/learn-about-your-data-with-about-seventy-data-exploration-functions-all-in-one-python-class/)
- * [Monte Carlo simulation for project cost estimation](https://pkghosh.wordpress.com/2020/05/11/monte-carlo-simulation-library-in-python-with-project-cost-estimation-as-an-example/)
- * [Information theory based feature selection](https://pkghosh.wordpress.com/2022/05/29/feature-selection-with-information-theory-based-techniques-in-python/)
- * [Stock Portfolio Balancing with Monte Carlo Simulation](https://pkghosh.wordpress.com/2022/08/23/stock-portfolio-balancing-with-monte-carlo-simulation/)
- * [Synthetic Regression Data Generation in Python](https://pkghosh.wordpress.com/2023/01/22/synthetic-regression-data-generation-in-python/)
-
- 4. Code usage example
-
- Here is some example code that uses five of the modules. You can find lots of examples in
- [another repo](https://github.com/pranab/avenir/tree/master/python/app) of mine. There the
- imports are direct and not through the package matumizi. The example directory also has example code
-
-
-     import sys
-     import math
-     from matumizi import util as ut
-     from matumizi import mlutil as ml
-     from matumizi import sampler as sa
-     from matumizi import stats as st
-     from matumizi import daexp as de
-
-     #generate some random strings
-     ldata = ut.genIdList(10, 6)
-     print("random strings")
-     print(ldata)
-
-     #select random sublist from a list
-     sldata = ut.selectRandomSubListFromList(ldata, 4)
-     print("\nselected random strings")
-     print(sldata)
-
-     #random walk
-     print("\nrandom walk")
-     for pos in ml.randomWalk(20, 10, -2, 2):
-         print(pos)
-
-     #sample from a non parametric sampler
-     print("\nsampling from a non parametric sampler")
-     sampler = sa.NonParamRejectSampler(10, 4, 1, 4, 8, 16, 14, 12, 8, 4, 2)
-     for _ in range(8):
-         d = sampler.sample()
-         print(ut.formatFloat(3, d))
-
-     #statistics from a sliding window
-     print("\nstats from sliding window")
-     wsize = 30
-     win = st.SlidingWindowStat.createEmpty(wsize)
-     mean = 10
-     sd = 2
-     ns = sa.NormalSampler(mean, sd)
-     for _ in range(40):
-         #gaussian with some noise
-         d = ns.sample() + sa.randomFloat(-1, 1)
-         win.add(d)
-     re = win.getStat()
-     print(re)
-
-     #get time series components
-     print("\ntime series components")
-     expl = de.DataExplorer(False)
-     mean = 100
-     sd = 5
-     period = 7
-     trdelta = .1
-     cycle = list(map(lambda v : 10 * math.sin(2 * math.pi * v / period), range(period)))
-     sampler = sa.NormalSamplerWithTrendCycle(mean, sd, trdelta, cycle)
-     ldata = list(map(lambda i : sampler.sample(), range(200)))
-     expl.addListNumericData(ldata, "test")
-     re = expl.getTimeSeriesComponents("test", "additive", period, True)
-     print(re)
matumizi/config/mcamp.properties DELETED
@@ -1,10 +0,0 @@
- common.pvar.samplers=1:3:1:30:50:20:discrete:int,100:20:normal:float,1:8:1:10:20:50:70:85:100:60:30:discrete:int,1:7:1:60:40:30:50:70:95:120:discrete:int,0.5:0:1:bernauli:int
- common.pvar.ranges=1,3,30,200,1,8,1,7,0,1
- common.linear.weights=1.2,1.4,1.0,1.2,1.5
- common.square.weights=1,0.15
- common.crterm.weights=2,3,0.1
- common.corr.params=0:1:40.0:30.0:.08:false
- common.bias=20
- common.noise=normal,.05
- common.tvar.range=50,300
- common.weight.niter=200
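
Each entry in common.pvar.samplers is a colon-delimited sampler spec whose last two fields are a distribution name and a data type, with the preceding fields as distribution parameters; the actual parsing happens inside matumizi's RegressionDataGenerator. A minimal sketch of how such a spec could be split up (the function name and dict layout here are hypothetical illustrations, not part of matumizi):

    # hypothetical illustration of the colon-delimited sampler spec format
    def parse_sampler_spec(spec):
        fields = spec.split(":")
        # last two fields are distribution name and data type, the rest are parameters
        return {"params": fields[:-2], "distr": fields[-2], "type": fields[-1]}

    for spec in "100:20:normal:float,0.5:0:1:bernauli:int".split(","):
        print(parse_sampler_spec(spec))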
matumizi/docs/info_theory_based_feat_sel_tutorial.txt DELETED
@@ -1,36 +0,0 @@
- This tutorial is for information theory based feature selection on a loan application data set. The
- implementation is in the python package matumizi
-
- Setup
- =====
- Install matumizi as follows
- pip3 install -i https://test.pypi.org/simple/ matumizi==0.0.5
-
- Install requirements
- pip3 install -r requirements.txt
-
- Generate loan application data
- ==============================
- python3 fesel.py --op gen --nloan 2000 --noise .05 --klen 10 > lo.txt
-
- where
- op = operation to perform
- nloan = num of loans
- noise = noise level
- klen = loan ID length
-
- Options for "algo" (feature selection techniques)
- mrmr - Max relevance min redundancy
- jmi - Joint mutual information
- cmim - Conditional mutual information maximization
- icap - Interaction capping
- infg - Information gain
-
- Feature selection
- =================
- python3 fesel.py --op fsel --fpath lo.txt --algo mrmr
-
- where
- op = operation to perform
- fpath = path to file containing loan data
- algo = feature selection algorithm (mrmr, jmi, cmim, icap, infg)
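
For reference, the "fsel" operation in examples/fesel.py (also deleted in this commit) reduces to a few DataExplorer calls; a minimal sketch, assuming loan data generated as above into lo.txt:

    from matumizi.daexp import DataExplorer

    expl = DataExplorer(False)
    # column indexes followed by data set names, as in examples/fesel.py
    expl.addFileNumericData("lo.txt", 5, 8, 11, 12, "income", "debt", "crscore", "saving")
    expl.addFileCatData("lo.txt", 3, 4, 15, "education", "selfemp", "target")
    fdt = ["education", "cat", "selfemp", "cat", "income", "num", "debt", "num", "crscore", "num"]
    tdt = ["target", "cat"]
    # max relevance min redundancy, selecting 3 features
    print(expl.getMaxRelMinRedFeatures(fdt, tdt, 3))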
matumizi/docs/stock_portfolio_balancing_with_mc_simulation_tutorial.txt DELETED
@@ -1,59 +0,0 @@
- This tutorial is for financial portfolio balancing with Monte Carlo simulation and the Sharpe ratio
-
-
- Setup
- =====
- Install matumizi, which is a package for data exploration and various other utilities
- pip3 install -i https://test.pypi.org/simple/ matumizi==0.0.3
-
- Portfolio data
- ==============
- Decide what stocks to have in the portfolio and create a portfolio data file, with one row
- per stock, with each row as below containing 3 fields
- stock_symbol,num_stocks,value_at_beginning_of_time_window
-
- Stock historical data
- =====================
- Choose a time window (e.g. 6 months) and download historical stock data for all the stocks in the portfolio
- from this web site
- https://www.nasdaq.com/market-activity/quotes/historical
-
- Store all files in the directory specified by the command line arg "sdfpath". Change each file name so that
- the file name begins with "SS_", where SS is the stock symbol
-
-
- Run simulator
- =============
- python3 pobal.py --op simu --niter 100 --sdfpath ./sdata --spdpath spdata.txt --exfac 0.9 --rfret 0.01
-
- niter = num of iterations
- sdfpath = path of directory containing stock data files. The file names should start with <SS>_ where SS
- is the stock symbol
- spdpath = path of file containing current holdings. Each row has 3 comma separated fields: stock symbol,
- num of stocks and the value at the beginning of the historic data time window (spdata.txt in the resource directory)
- exfac = factor for the exponential forecast of stock price
- rfret = risk free investment return in the time window
-
- The command line argument values are examples. Change them as needed
-
- Output
- ======
- The end of the output will look as below
- best score 8.839
- weights [0.10270294837929556, 0.11041322597243025, 0.000652404909398755, 0.11668341692081166, 0.018728111576860603, 0.12688306074193234, 0.016674345483451796, 0.1310681987561672, 0.020349302455518792, 0.15131254832113178, 0.07228010995988338, 0.13225232652311789]
- buy and sell recommendations
- ('WMT', 27)
- ('PFE', 358)
- ('NFLX', -212)
- ('AMD', 93)
- ('TSLA', -58)
- ('AMZN', 155)
- ('META', -120)
- ('QCOM', 129)
- ('CSCO', -17)
- ('MSFT', 73)
- ('SBUX', 62)
- ('AAPL', 129)
-
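
The score being maximized is a Sharpe-style ratio: the weighted excess return over the risk free rate, divided by the weighted return covariance. A minimal standalone sketch of that metric, mirroring the balance() callback in examples/pobal.py (function name here is illustrative):

    import numpy as np

    def sharpe_score(weights, excess_returns, rcovar):
        # excess_returns: per stock return minus the risk free return
        wr = float(np.dot(weights, excess_returns))
        # weighted return covariance, a proxy for portfolio risk
        wrcv = float(np.dot(weights, np.dot(rcovar, weights)))
        return wr / wrcv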
matumizi/examples/fesel.py DELETED
@@ -1,264 +0,0 @@
- #!/usr/local/bin/python3
-
- # Author: Pranab Ghosh
- #
- # Licensed under the Apache License, Version 2.0 (the "License"); you
- # may not use this file except in compliance with the License. You may
- # obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- # implied. See the License for the specific language governing
- # permissions and limitations under the License.
-
- # Package imports
- import os
- import sys
- import random
- import statistics
- import matplotlib.pyplot as plt
- import argparse
- from matumizi.util import *
- from matumizi.mlutil import *
- from matumizi.daexp import *
- from matumizi.sampler import *
-
- NFEAT = 11
- NFEAT_EXT = 14
-
- class LoanApprove:
-     def __init__(self, numLoans=None):
-         self.numLoans = numLoans
-         self.marStatus = ["married", "single", "divorced"]
-         self.loanTerm = ["7", "15", "30"]
-         self.addExtra = False
-
-
-     def initTwo(self):
-         """
-         initialize samplers
-         """
-         self.approvDistr = CategoricalRejectSampler(("1", 60), ("0", 40))
-         self.featCondDister = {}
-
-         #marital status
-         key = ("1", 0)
-         distr = CategoricalRejectSampler(("married", 100), ("single", 60), ("divorced", 40))
-         self.featCondDister[key] = distr
-         key = ("0", 0)
-         distr = CategoricalRejectSampler(("married", 40), ("single", 100), ("divorced", 40))
-         self.featCondDister[key] = distr
-
-
-         # num of children
-         key = ("1", 1)
-         distr = CategoricalRejectSampler(("1", 100), ("2", 90), ("3", 40))
-         self.featCondDister[key] = distr
-         key = ("0", 1)
-         distr = CategoricalRejectSampler(("1", 50), ("2", 70), ("3", 100))
-         self.featCondDister[key] = distr
-
-         # education
-         key = ("1", 2)
-         distr = CategoricalRejectSampler(("1", 30), ("2", 80), ("3", 100))
-         self.featCondDister[key] = distr
-         key = ("0", 2)
-         distr = CategoricalRejectSampler(("1", 100), ("2", 40), ("3", 30))
-         self.featCondDister[key] = distr
-
-         #self employed
-         key = ("1", 3)
-         distr = CategoricalRejectSampler(("1", 40), ("0", 100))
-         self.featCondDister[key] = distr
-         key = ("0", 3)
-         distr = CategoricalRejectSampler(("1", 100), ("0", 30))
-         self.featCondDister[key] = distr
-
-         # income
-         key = ("1", 4)
-         distr = GaussianRejectSampler(120,15)
-         self.featCondDister[key] = distr
-         key = ("0", 4)
-         distr = GaussianRejectSampler(50,10)
-         self.featCondDister[key] = distr
-
-         # years of experience
-         key = ("1", 5)
-         distr = GaussianRejectSampler(15,3)
-         self.featCondDister[key] = distr
-         key = ("0", 5)
-         distr = GaussianRejectSampler(5,1)
-         self.featCondDister[key] = distr
-
-         # number of years in current job
-         key = ("1", 6)
-         distr = GaussianRejectSampler(3,.5)
-         self.featCondDister[key] = distr
-         key = ("0", 6)
-         distr = GaussianRejectSampler(1,.2)
-         self.featCondDister[key] = distr
-
-         # outstanding debt
-         key = ("1", 7)
-         distr = GaussianRejectSampler(20,5)
-         self.featCondDister[key] = distr
-         key = ("0", 7)
-         distr = GaussianRejectSampler(60,10)
-         self.featCondDister[key] = distr
-
-         # loan amount
-         key = ("1", 8)
-         distr = GaussianRejectSampler(300,50)
-         self.featCondDister[key] = distr
-         key = ("0", 8)
-         distr = GaussianRejectSampler(600,50)
-         self.featCondDister[key] = distr
-
-         # loan term
-         key = ("1", 9)
-         distr = CategoricalRejectSampler(("7", 100), ("15", 40), ("30", 60))
-         self.featCondDister[key] = distr
-         key = ("0", 9)
-         distr = CategoricalRejectSampler(("7", 30), ("15", 100), ("30", 60))
-         self.featCondDister[key] = distr
-
-         # credit score
-         key = ("1", 10)
-         distr = GaussianRejectSampler(700,20)
-         self.featCondDister[key] = distr
-         key = ("0", 10)
-         distr = GaussianRejectSampler(500,50)
-         self.featCondDister[key] = distr
-
-         if self.addExtra:
-             # saving
-             key = ("1", 11)
-             distr = NormalSampler(80,10)
-             self.featCondDister[key] = distr
-             key = ("0", 11)
-             distr = NormalSampler(60,8)
-             self.featCondDister[key] = distr
-
-             # retirement
-             zDistr = NormalSampler(0, 0)
-             key = ("1", 12)
-             sDistr = DiscreteRejectSampler(0,1,1,20,80)
-             nzDistr = NormalSampler(100,20)
-             distr = DistrMixtureSampler(sDistr, zDistr, nzDistr)
-             self.featCondDister[key] = distr
-             key = ("0", 12)
-             sDistr = DiscreteRejectSampler(0,1,1,50,50)
-             nzDistr = NormalSampler(40,10)
-             distr = DistrMixtureSampler(sDistr, zDistr, nzDistr)
-             self.featCondDister[key] = distr
-
-             #num of prior mortgage loans
-             key = ("1", 13)
-             distr = DiscreteRejectSampler(0,3,1,20,60,40,15)
-             self.featCondDister[key] = distr
-             key = ("0", 13)
-             distr = DiscreteRejectSampler(0,1,1,70,30)
-             self.featCondDister[key] = distr
-
-
-     def generateTwo(self, noise, keyLen, addExtra):
-         """
-         ancestral sampling
-         """
-         self.addExtra = addExtra
-         self.initTwo()
-
-         #error
-         erDistr = GaussianRejectSampler(0, noise)
-
-         #sampler
-         numChildren = NFEAT_EXT if self.addExtra else NFEAT
-         sampler = AncestralSampler(self.approvDistr, self.featCondDister, numChildren)
-
-         for i in range(self.numLoans):
-             (claz, features) = sampler.sample()
-
-             # convert numeric features to int
-             features[4] = int(features[4])
-             features[7] = int(features[7])
-             features[8] = int(features[8])
-             features[10] = int(features[10])
-             if self.addExtra:
-                 features[11] = int(features[11])
-                 features[12] = int(features[12])
-
-             # add noise to class label
-             claz = addNoiseCat(claz, ["0", "1"], noise)
-
-             strFeatures = list(map(lambda f: toStr(f, 2), features))
-             rec = genID(keyLen) + "," + ",".join(strFeatures) + "," + claz
-             print(rec)
-
-     def encodeDummy(self, fileName, extra):
-         """
-         dummy var encoding
-         """
-         catVars = {}
-         catVars[1] = self.marStatus
-         catVars[10] = self.loanTerm
-         rSize = NFEAT_EXT if extra else NFEAT
-         rSize += 2
-         dummyVarGen = DummyVarGenerator(rSize, catVars, "1", "0", ",")
-         for row in fileRecGen(fileName, None):
-             newRow = dummyVarGen.processRow(row)
-             print(newRow)
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument('--op', type=str, default = "none", help = "operation")
-     parser.add_argument('--nloan', type=int, default = 1000, help = "num of loans")
-     parser.add_argument('--noise', type=float, default = 0.1, help = "noise level")
-     parser.add_argument('--klen', type=int, default = 10, help = "key length")
-     parser.add_argument('--fpath', type=str, default = "none", help = "source file path")
-     parser.add_argument('--algo', type=str, default = "none", help = "feature selection algorithm")
-     args = parser.parse_args()
-     op = args.op
-
-     if op == "gen":
-         """ generate data """
-         numLoans = args.nloan
-         loan = LoanApprove(numLoans)
-         noise = args.noise
-         keyLen = args.klen
-         addExtra = True
-         loan.generateTwo(noise, keyLen, addExtra)
-
-     elif op == "encd":
-         """ encode binary """
-         fileName = args.fpath
-         extra = True
-         loan = LoanApprove()
-         loan.encodeDummy(fileName, extra)
-
-
-     elif op == "fsel":
-         """ feature select """
-         fpath = args.fpath
-         algo = args.algo
-         expl = DataExplorer(False)
-         expl.addFileNumericData(fpath, 5, 8, 11, 12, "income", "debt", "crscore", "saving")
-         expl.addFileCatData(fpath, 3, 4, 15, "education", "selfemp", "target")
-
-         fdt = ["education", "cat", "selfemp", "cat", "income", "num", "debt", "num", "crscore", "num"]
-         tdt = ["target", "cat"]
-         if args.algo == "mrmr":
-             res = expl.getMaxRelMinRedFeatures(fdt, tdt, 3)
-         elif args.algo == "jmi":
-             res = expl.getJointMutInfoFeatures(fdt, tdt, 3)
-         elif args.algo == "cmim":
-             res = expl.getCondMutInfoMaxFeatures(fdt, tdt, 3)
-         elif args.algo == "icap":
-             res = expl.getInteractCapFeatures(fdt, tdt, 3)
-         elif args.algo == "infg":
-             res = expl.getInfoGainFeatures(fdt, tdt, 3, 8)
-         else:
-             exitWithMsg("invalid feature selection algorithm")
-
-         print(res)
-     else:
-         exitWithMsg("invalid command")
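
The "encd" operation (dummy encoding of the marital status and loan term fields) is not covered in the tutorial above; going by the argparse options, it would presumably be invoked as

    python3 fesel.py --op encd --fpath lo.txt > loe.txt

with the output file name being arbitrary.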
matumizi/examples/mcamp.py DELETED
@@ -1,50 +0,0 @@
- #!/usr/local/bin/python3
-
- # Author: Pranab Ghosh
- #
- # Licensed under the Apache License, Version 2.0 (the "License"); you
- # may not use this file except in compliance with the License. You may
- # obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- # implied. See the License for the specific language governing
- # permissions and limitations under the License.
-
- # Package imports
- import os
- import sys
- import random
- import statistics
- import matplotlib.pyplot as plt
- import argparse
- from matumizi.util import *
- from matumizi.mlutil import *
- from matumizi.daexp import *
- from matumizi.sampler import *
-
- """
- Synthetic regression data generation, driven by a sampler configuration file
- (see config/mcamp.properties)
- """
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument('--op', type=str, default = "none", help = "operation")
-     parser.add_argument('--genconf', type=str, default = "", help = "data generator config file")
-     parser.add_argument('--nsamp', type=int, default = 1000, help = "no of samples to generate")
-     args = parser.parse_args()
-     op = args.op
-
-     if op == "gen":
-         """ generate data """
-         dgen = RegressionDataGenerator(args.genconf)
-         for _ in range(args.nsamp):
-             s = dgen.sample()
-             pv = toStrFromList(s[0], 2)
-             pv = pv + "," + toStr(s[1], 2)
-             print(pv)
-
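
Going by the argparse options, a typical invocation would presumably be

    python3 mcamp.py --op gen --genconf ./config/mcamp.properties --nsamp 1000 > reg.txt

which writes one comma separated record per sample, predictor values followed by the target value.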
matumizi/examples/pobal.py DELETED
@@ -1,193 +0,0 @@
- #!/usr/local/bin/python3
-
- # Author: Pranab Ghosh
- #
- # Licensed under the Apache License, Version 2.0 (the "License"); you
- # may not use this file except in compliance with the License. You may
- # obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- # implied. See the License for the specific language governing
- # permissions and limitations under the License.
-
- # Package imports
- import os
- import sys
- import random
- import statistics
- import numpy as np
- import matplotlib.pyplot as plt
- import argparse
- from matumizi.util import *
- from matumizi.sampler import *
- from matumizi.mcsim import *
-
- """
- Balances a portfolio with Monte Carlo simulation and the Sharpe ratio
- """
-
- class PortFolio():
-     """
-     portfolio
-     """
-     def __init__(self):
-         """
-         initialize
-         """
-         self.stocks = list()
-         self.srets = list()
-         self.rcovar = None
-         self.nstock = None
-         self.weights = None
-         self.metric = -sys.float_info.max
-         self.rfret = None
-         self.spred = list()
-
-
-     def loadStData(self, sdfPath, exfac):
-         """
-         load and process stock data
-         """
-         e1 = 1 - exfac
-         e2 = e1 * e1
-         files = getAllFiles(sdfPath)
-         print(files)
-
-         returns = list()
-         for ss, qn, pp in self.stocks:
-             print("next stock ", ss)
-             for fp in files:
-                 fname = os.path.basename(fp)
-                 stname = fname.split("_")[0]
-                 #print("stock name from file name ", stname)
-
-                 if stname == ss:
-                     #daily prices
-                     print("loading ", ss)
-                     prices = getFileColumnAsString(fp, 1)
-                     prices = prices[1:]
-                     prices = list(map(lambda p : float(p[1:]), prices))
-
-                     #predicted price and return
-                     sppred = exfac * prices[0] + exfac * e1 * prices[1] + exfac * e2 * prices[2]
-                     self.spred.append(sppred)
-                     up = pp / qn
-                     sret = (sppred - up) / up
-                     r = (ss, sret)
-                     self.srets.append(r)
-
-                     #daily returns
-                     bp = prices[-1]
-                     sdret = list(map(lambda p : (p - bp) / bp, prices))
-                     #print("daily return size ", len(sdret))
-                     returns.append(sdret)
-                     break
-
-         returns = np.array(returns)
-         print("daily returns shape ", returns.shape)
-         self.rcovar = np.cov(returns)
-         print("covar shape ", self.rcovar.shape)
-
-
-     def optimize(self):
-         """
-         balance, i.e. make buy and sell recommendations
-         """
-         tamount = 0
-         amounts = list()
-         for ss, qn, pp in self.stocks:
-             amnt = pp
-             amounts.append(amnt)
-             tamount += amnt
-
-         namounts = list(map(lambda w : w * tamount, self.weights))
-         quantities = list()
-         for am, nam, ppr in zip(amounts, namounts, self.spred):
-             #no of stocks to buy or sell for each
-             tamount = nam - am
-             qnt = int(tamount / ppr)
-             quantities.append(qnt)
-
-         trans = list()
-         for s, q in zip(self.stocks, quantities):
-             tr = (s[0], q)
-             trans.append(tr)
-
-         return trans
-
- # portfolio object
- pfolio = PortFolio()
-
- def balance(args):
-     """
-     callback for portfolio weights
-     """
-     weights = args[:pfolio.nstock]
-     #print("weights ", weights)
-     weights = scaleBySum(weights)
-     #print("scaled weights ", weights)
-
-     #weighted return
-     wr = 0
-     for r, w in zip(pfolio.srets, weights):
-         wr += (r[1] - pfolio.rfret) * w
-
-     wrcv = 0
-     for i in range(pfolio.nstock):
-         for j in range(pfolio.nstock):
-             wrcv += pfolio.rcovar[i][j] * weights[i] * weights[j]
-
-     metric = wr / wrcv
-     print("score {:.3f}".format(metric))
-     if metric > pfolio.metric:
-         pfolio.metric = metric
-         pfolio.weights = weights
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument('--op', type=str, default = "none", help = "operation")
-     parser.add_argument('--niter', type=int, default = 100, help = "num of iterations")
-     parser.add_argument('--sdfpath', type=str, default = "none", help = "stock data file directory path")
-     parser.add_argument('--spdpath', type=str, default = "none", help = "path of file containing purchase data")
-     parser.add_argument('--exfac', type=float, default = 0.9, help = "exponential factor for prediction")
-     parser.add_argument('--rfret', type=float, default = 0.2, help = "risk free return")
-     args = parser.parse_args()
-     op = args.op
-
-     if op == "simu":
-         tdata = getFileLines(args.spdpath)
-         for rec in tdata:
-             #stock symbol, quantity, purchase price
-             sname = rec[0]
-             quant = int(rec[1])
-             pcost = float(rec[2])
-             t = (sname, quant, pcost)
-             pfolio.stocks.append(t)
-
-         #create and run simulator
-         numIter = args.niter
-         lfp = "./log/mcsim.log"
-         simulator = MonteCarloSimulator(numIter, balance, lfp, "info")
-         nstock = len(pfolio.stocks)
-         for _ in range(nstock):
-             simulator.registerUniformSampler(0.0, 1.0)
-         pfolio.nstock = nstock
-         pfolio.rfret = args.rfret
-         pfolio.loadStData(args.sdfpath, args.exfac)
-         simulator.run()
-
-         print("best score {:.3f}".format(pfolio.metric))
-         print("weights ", pfolio.weights)
-         print("buy and sell recommendations")
-         trans = pfolio.optimize()
-         for tr in trans:
-             print(tr)
-
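
The price forecast in loadStData is a truncated exponentially weighted average of the three most recent daily prices: with smoothing factor f = exfac and the most recent price first, pred = f*p0 + f*(1-f)*p1 + f*(1-f)^2*p2. A minimal standalone sketch (function name here is illustrative):

    def ewma_forecast(prices, exfac=0.9):
        # prices[0] is the most recent price, as in loadStData above
        e1 = 1.0 - exfac
        return exfac * prices[0] + exfac * e1 * prices[1] + exfac * e1 * e1 * prices[2]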
matumizi/matumizi/__init__.py DELETED
File without changes
matumizi/matumizi/daexp.py DELETED
@@ -1,3121 +0,0 @@
1
- #!/usr/local/bin/python3
2
-
3
- # Author: Pranab Ghosh
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License"); you
6
- # may not use this file except in compliance with the License. You may
7
- # obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
- # implied. See the License for the specific language governing
15
- # permissions and limitations under the License.
16
-
17
- # Package imports
18
- import os
19
- import sys
20
- import numpy as np
21
- import pandas as pd
22
- import sklearn as sk
23
- from sklearn import preprocessing
24
- from sklearn import metrics
25
- import random
26
- from math import *
27
- from decimal import Decimal
28
- import pprint
29
- from statsmodels.graphics import tsaplots
30
- from statsmodels.tsa import stattools as stt
31
- from statsmodels.stats import stattools as sstt
32
- from sklearn.linear_model import LinearRegression
33
- from matplotlib import pyplot as plt
34
- from scipy import stats as sta
35
- from statsmodels.tsa.seasonal import seasonal_decompose
36
- import statsmodels.api as sm
37
- from sklearn.ensemble import IsolationForest
38
- from sklearn.neighbors import LocalOutlierFactor
39
- from sklearn.svm import OneClassSVM
40
- from sklearn.covariance import EllipticEnvelope
41
- from sklearn.mixture import GaussianMixture
42
- from sklearn.cluster import KMeans
43
- from sklearn.decomposition import PCA
44
- import hurst
45
- from .util import *
46
- from .mlutil import *
47
- from .sampler import *
48
- from .stats import *
49
-
50
- """
51
- Load data from a CSV file, data frame, numpy array or list
52
- Each data set (array like) is given a name while loading
53
- Perform various data exploration operation refering to the data sets by name
54
- Save and restore workspace if needed
55
- """
56
- class DataSetMetaData:
57
- """
58
- data set meta data
59
- """
60
- dtypeNum = 1
61
- dtypeCat = 2
62
- dtypeBin = 3
63
- def __init__(self, dtype):
64
- self.notes = list()
65
- self.dtype = dtype
66
-
67
- def addNote(self, note):
68
- """
69
- add note
70
- """
71
- self.notes.append(note)
72
-
73
-
74
- class DataExplorer:
75
- """
76
- various data exploration functions
77
- """
78
- def __init__(self, verbose=True):
79
- """
80
- initialize
81
-
82
- Parameters
83
- verbose : True for verbosity
84
- """
85
- self.dataSets = dict()
86
- self.metaData = dict()
87
- self.pp = pprint.PrettyPrinter(indent=4)
88
- self.verbose = verbose
89
-
90
- def setVerbose(self, verbose):
91
- """
92
- sets verbose
93
-
94
- Parameters
95
- verbose : True for verbosity
96
- """
97
- self.verbose = verbose
98
-
99
- def save(self, filePath):
100
- """
101
- save checkpoint
102
-
103
- Parameters
104
- filePath : path of file where saved
105
- """
106
- self.__printBanner("saving workspace")
107
- ws = dict()
108
- ws["data"] = self.dataSets
109
- ws["metaData"] = self.metaData
110
- saveObject(ws, filePath)
111
- self.__printDone()
112
-
113
- def restore(self, filePath):
114
- """
115
- restore checkpoint
116
-
117
- Parameters
118
- filePath : path of file from where to store
119
- """
120
- self.__printBanner("restoring workspace")
121
- ws = restoreObject(filePath)
122
- self.dataSets = ws["data"]
123
- self.metaData = ws["metaData"]
124
- self.__printDone()
125
-
126
-
127
- def queryFileData(self, filePath, *columns):
128
- """
129
- query column data type from a data file
130
-
131
- Parameters
132
- filePath : path of file with data
133
- columns : indexes followed by column names or column names
134
- """
135
- self.__printBanner("querying column data type from a data frame")
136
- lcolumns = list(columns)
137
- noHeader = type(lcolumns[0]) == int
138
- if noHeader:
139
- df = pd.read_csv(filePath, header=None)
140
- else:
141
- df = pd.read_csv(filePath, header=0)
142
- return self.queryDataFrameData(df, *columns)
143
-
144
- def queryDataFrameData(self, df, *columns):
145
- """
146
- query column data type from a data frame
147
-
148
- Parameters
149
- df : data frame with data
150
- columns : indexes followed by column name or column names
151
- """
152
- self.__printBanner("querying column data type from a data frame")
153
- columns = list(columns)
154
- noHeader = type(columns[0]) == int
155
- dtypes = list()
156
- if noHeader:
157
- nCols = int(len(columns) / 2)
158
- colIndexes = columns[:nCols]
159
- cnames = columns[nCols:]
160
- nColsDf = len(df.columns)
161
- for i in range(nCols):
162
- ci = colIndexes[i]
163
- assert ci < nColsDf, "col index {} outside range".format(ci)
164
- col = df.loc[ : , ci]
165
- dtypes.append(self.getDataType(col))
166
- else:
167
- cnames = columns
168
- for c in columns:
169
- col = df[c]
170
- dtypes.append(self.getDataType(col))
171
-
172
- nt = list(zip(cnames, dtypes))
173
- result = self.__printResult("columns and data types", nt)
174
- return result
175
-
176
- def getDataType(self, col):
177
- """
178
- get data type
179
-
180
- Parameters
181
- col : contains data array like
182
- """
183
- if isBinary(col):
184
- dtype = "binary"
185
- elif isInteger(col):
186
- dtype = "integer"
187
- elif isFloat(col):
188
- dtype = "float"
189
- elif isCategorical(col):
190
- dtype = "categorical"
191
- else:
192
- dtype = "mixed"
193
- return dtype
194
-
195
-
196
- def addFileNumericData(self,filePath, *columns):
197
- """
198
- add numeric columns from a file
199
-
200
- Parameters
201
- filePath : path of file with data
202
- columns : indexes followed by column names or column names
203
- """
204
- self.__printBanner("adding numeric columns from a file")
205
- self.addFileData(filePath, True, *columns)
206
- self.__printDone()
207
-
208
-
209
- def addFileBinaryData(self,filePath, *columns):
210
- """
211
- add binary columns from a file
212
-
213
- Parameters
214
- filePath : path of file with data
215
- columns : indexes followed by column names or column names
216
- """
217
- self.__printBanner("adding binary columns from a file")
218
- self.addFileData(filePath, False, *columns)
219
- self.__printDone()
220
-
221
- def addFileData(self, filePath, numeric, *columns):
222
- """
223
- add columns from a file
224
-
225
- Parameters
226
- filePath : path of file with data
227
- numeric : True if numeric False in binary
228
- columns : indexes followed by column names or column names
229
- """
230
- columns = list(columns)
231
- noHeader = type(columns[0]) == int
232
- if noHeader:
233
- df = pd.read_csv(filePath, header=None)
234
- else:
235
- df = pd.read_csv(filePath, header=0)
236
- self.addDataFrameData(df, numeric, *columns)
237
-
238
- def addDataFrameNumericData(self,filePath, *columns):
239
- """
240
- add numeric columns from a data frame
241
-
242
- Parameters
243
- filePath : path of file with data
244
- columns : indexes followed by column names or column names
245
- """
246
- self.__printBanner("adding numeric columns from a data frame")
247
- self.addDataFrameData(filePath, True, *columns)
248
-
249
-
250
- def addDataFrameBinaryData(self,filePath, *columns):
251
- """
252
- add binary columns from a data frame
253
-
254
- Parameters
255
- filePath : path of file with data
256
- columns : indexes followed by column names or column names
257
- """
258
- self.__printBanner("adding binary columns from a data frame")
259
- self.addDataFrameData(filePath, False, *columns)
260
-
261
-
262
- def addDataFrameData(self, df, numeric, *columns):
263
- """
264
- add columns from a data frame
265
-
266
- Parameters
267
- df : data frame with data
268
- numeric : True if numeric False in binary
269
- columns : indexes followed by column names or column names
270
- """
271
- columns = list(columns)
272
- noHeader = type(columns[0]) == int
273
- if noHeader:
274
- nCols = int(len(columns) / 2)
275
- colIndexes = columns[:nCols]
276
- nColsDf = len(df.columns)
277
- for i in range(nCols):
278
- ci = colIndexes[i]
279
- assert ci < nColsDf, "col index {} outside range".format(ci)
280
- col = df.loc[ : , ci]
281
- if numeric:
282
- assert isNumeric(col), "data is not numeric"
283
- else:
284
- assert isBinary(col), "data is not binary"
285
- col = col.to_numpy()
286
- cn = columns[i + nCols]
287
- dtype = DataSetMetaData.dtypeNum if numeric else DataSetMetaData.dtypeBin
288
- self.__addDataSet(cn, col, dtype)
289
- else:
290
- for c in columns:
291
- col = df[c]
292
- if numeric:
293
- assert isNumeric(col), "data is not numeric"
294
- else:
295
- assert isBinary(col), "data is not binary"
296
- col = col.to_numpy()
297
- dtype = DataSetMetaData.dtypeNum if numeric else DataSetMetaData.dtypeBin
298
- self.__addDataSet(c, col, dtype)
299
-
300
- def __addDataSet(self, dsn, data, dtype):
301
- """
302
- add dada set
303
-
304
- Parameters
305
- dsn: data set name
306
- data : numpy array data
307
- """
308
- self.dataSets[dsn] = data
309
- self.metaData[dsn] = DataSetMetaData(dtype)
310
-
311
-
312
- def addListNumericData(self, ds, name):
313
- """
314
- add numeric data from a list
315
-
316
- Parameters
317
- ds : list with data
318
- name : name of data set
319
- """
320
- self.__printBanner("add numeric data from a list")
321
- self.addListData(ds, True, name)
322
- self.__printDone()
323
-
324
-
325
- def addListBinaryData(self, ds, name):
326
- """
327
- add binary data from a list
328
-
329
- Parameters
330
- ds : list with data
331
- name : name of data set
332
- """
333
- self.__printBanner("adding binary data from a list")
334
- self.addListData(ds, False, name)
335
- self.__printDone()
336
-
337
- def addListData(self, ds, numeric, name):
338
- """
339
- adds list data
340
-
341
- Parameters
342
- ds : list with data
343
- numeric : True if numeric False in binary
344
- name : name of data set
345
- """
346
- assert type(ds) == list, "data not a list"
347
- if numeric:
348
- assert isNumeric(ds), "data is not numeric"
349
- else:
350
- assert isBinary(ds), "data is not binary"
351
- dtype = DataSetMetaData.dtypeNum if numeric else DataSetMetaData.dtypeBin
352
- self.dataSets[name] = np.array(ds)
353
- self.metaData[name] = DataSetMetaData(dtype)
354
-
355
-
356
- def addFileCatData(self, filePath, *columns):
357
- """
358
- add categorical columns from a file
359
-
360
- Parameters
361
- filePath : path of file with data
362
- columns : indexes followed by column names or column names
363
- """
364
- self.__printBanner("adding categorical columns from a file")
365
- columns = list(columns)
366
- noHeader = type(columns[0]) == int
367
- if noHeader:
368
- df = pd.read_csv(filePath, header=None)
369
- else:
370
- df = pd.read_csv(filePath, header=0)
371
-
372
- self.addDataFrameCatData(df, *columns)
373
- self.__printDone()
374
-
375
- def addDataFrameCatData(self, df, *columns):
376
- """
377
- add categorical columns from a data frame
378
-
379
- Parameters
380
- df : data frame with data
381
- columns : indexes followed by column names or column names
382
- """
383
- self.__printBanner("adding categorical columns from a data frame")
384
- columns = list(columns)
385
- noHeader = type(columns[0]) == int
386
- if noHeader:
387
- nCols = int(len(columns) / 2)
388
- colIndexes = columns[:nCols]
389
- nColsDf = len(df.columns)
390
- for i in range(nCols):
391
- ci = colIndexes[i]
392
- assert ci < nColsDf, "col index {} outside range".format(ci)
393
- col = df.loc[ : , ci]
394
- assert isCategorical(col), "data is not categorical"
395
- col = col.tolist()
396
- cn = columns[i + nCols]
397
- self.__addDataSet(cn, col, DataSetMetaData.dtypeCat)
398
- else:
399
- for c in columns:
400
- col = df[c].tolist()
401
- self.__addDataSet(c, col, DataSetMetaData.dtypeCat)
402
-
403
- def addListCatData(self, ds, name):
404
- """
405
- add categorical list data
406
-
407
- Parameters
408
- ds : list with data
409
- name : name of data set
410
- """
411
- self.__printBanner("adding categorical list data")
412
- assert type(ds) == list, "data not a list"
413
- assert isCategorical(ds), "data is not categorical"
414
- self.__addDataSet(name, ds, DataSetMetaData.dtypeCat)
415
- self.__printDone()
416
-
417
- def remData(self, ds):
418
- """
419
- removes data set
420
-
421
- Parameters
422
- ds : data set name
423
- """
424
- self.__printBanner("removing data set", ds)
425
- assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
426
- self.dataSets.pop(ds)
427
- self.metaData.pop(ds)
428
- names = self.showNames()
429
- self.__printDone()
430
- return names
431
-
432
- def addNote(self, ds, note):
433
- """
434
- get data
435
-
436
- Parameters
437
- ds : data set name or list or numpy array with data
438
- note: note text
439
- """
440
- self.__printBanner("adding note")
441
- assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
442
- mdata = self.metaData[ds]
443
- mdata.addNote(note)
444
- self.__printDone()
445
-
446
- def getNotes(self, ds):
447
- """
448
- get data
449
-
450
- Parameters
451
- ds : data set name or list or numpy array with data
452
- """
453
- self.__printBanner("getting notes")
454
- assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
455
- mdata = self.metaData[ds]
456
- dnotes = mdata.notes
457
- if self.verbose:
458
- for dn in dnotes:
459
- print(dn)
460
- return dnotes
461
-
462
- def getNumericData(self, ds):
463
- """
464
- get numeric data
465
-
466
- Parameters
467
- ds : data set name or list or numpy array with data
468
- """
469
- if type(ds) == str:
470
- assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
471
- assert self.metaData[ds].dtype == DataSetMetaData.dtypeNum, "data set {} is expected to be numerical type for this operation".format(ds)
472
- data = self.dataSets[ds]
473
- elif type(ds) == list:
474
- assert isNumeric(ds), "data is not numeric"
475
- data = np.array(ds)
476
- elif type(ds) == np.ndarray:
477
- data = ds
478
- else:
479
- raise "invalid type, expecting data set name, list or ndarray"
480
- return data
481
-
482
-
483
- def getCatData(self, ds):
484
- """
485
- get categorical data
486
-
487
- Parameters
488
- ds : data set name or list with data
489
- """
490
- if type(ds) == str:
491
- assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
492
- assert self.metaData[ds].dtype == DataSetMetaData.dtypeCat, "data set {} is expected to be categorical type for this operation".format(ds)
493
- data = self.dataSets[ds]
494
- elif type(ds) == list:
495
- assert isCategorical(ds), "data is not categorical"
496
- data = ds
497
- else:
498
- raise "invalid type, expecting data set name or list"
499
- return data
500
-
501
- def getAnyData(self, ds):
502
- """
503
- get any data
504
-
505
- Parameters
506
- ds : data set name or list with data
507
- """
508
- if type(ds) == str:
509
- assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
510
- data = self.dataSets[ds]
511
- elif type(ds) == list:
512
- data = ds
513
- else:
514
- raise "invalid type, expecting data set name or list"
515
- return data
516
-
517
- def loadCatFloatDataFrame(self, ds1, ds2):
518
- """
519
- loads float and cat data into data frame
520
-
521
- Parameters
522
- ds1: data set name or list
523
- ds2: data set name or list or numpy array
524
- """
525
- data1 = self.getCatData(ds1)
526
- data2 = self.getNumericData(ds2)
527
- self.ensureSameSize([data1, data2])
528
- df1 = pd.DataFrame(data=data1)
529
- df2 = pd.DataFrame(data=data2)
530
- df = pd.concat([df1,df2], axis=1)
531
- df.columns = range(df.shape[1])
532
- return df
533
-
534
- def showNames(self):
535
- """
536
- lists data set names
537
- """
538
- self.__printBanner("listing data set names")
539
- names = self.dataSets.keys()
540
- if self.verbose:
541
- print("data sets")
542
- for ds in names:
543
- print(ds)
544
- self.__printDone()
545
- return names
546
-
547
- def plot(self, ds, yscale=None):
548
- """
549
- plots data
550
-
551
- Parameters
552
- ds: data set name or list or numpy array
553
- yscale: y scale
554
- """
555
- self.__printBanner("plotting data", ds)
556
- data = self.getNumericData(ds)
557
- drawLine(data, yscale)
558
-
559
- def plotZoomed(self, ds, beg, end, yscale=None):
560
- """
561
- plots zoomed data
562
-
563
- Parameters
564
- ds: data set name or list or numpy array
565
- beg: begin offset
566
- end: end offset
567
- yscale: y scale
568
- """
569
- self.__printBanner("plotting data", ds)
570
- data = self.getNumericData(ds)
571
- drawLine(data[beg:end], yscale)
572
-
573
- def scatterPlot(self, ds1, ds2):
574
- """
575
- scatter plots data
576
-
577
- Parameters
578
- ds1: data set name or list or numpy array
579
- ds2: data set name or list or numpy array
580
- """
581
- self.__printBanner("scatter plotting data", ds1, ds2)
582
- data1 = self.getNumericData(ds1)
583
- data2 = self.getNumericData(ds2)
584
- self.ensureSameSize([data1, data2])
585
- x = np.arange(1, len(data1)+1, 1)
586
- plt.scatter(x, data1 ,color="red")
587
- plt.scatter(x, data2 ,color="blue")
588
- plt.show()
589
-
590
- def print(self, ds):
591
- """
592
- prunt data
593
-
594
- Parameters
595
- ds: data set name or list or numpy array
596
- """
597
- self.__printBanner("printing data", ds)
598
- assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
599
- data = self.dataSets[ds]
600
- if self.verbore:
601
- print(formatAny(len(data), "size"))
602
- print("showing first 50 elements" )
603
- print(data[:50])
604
-
605
-    def plotHist(self, ds, cumulative, density, nbins=20):
-        """
-        plots histogram
-
-        Parameters
-        ds: data set name or list or numpy array
-        cumulative: True if cumulative
-        density: True to normalize for probability density
-        nbins: num of bins
-        """
-        self.__printBanner("plotting histogram", ds)
-        data = self.getNumericData(ds)
-        plt.hist(data, bins=nbins, cumulative=cumulative, density=density)
-        plt.show()
-
-    def isMonotonicallyChanging(self, ds):
-        """
-        checks if monotonically increasing or decreasing
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("checking monotonic change", ds)
-        data = self.getNumericData(ds)
-        monoIncreasing = all(list(map(lambda i: data[i] >= data[i-1], range(1, len(data), 1))))
-        monoDecreasing = all(list(map(lambda i: data[i] <= data[i-1], range(1, len(data), 1))))
-        result = self.__printResult("monoIncreasing", monoIncreasing, "monoDecreasing", monoDecreasing)
-        return result
-
-    def getFreqDistr(self, ds, nbins=20):
-        """
-        gets histogram
-
-        Parameters
-        ds: data set name or list or numpy array
-        nbins: num of bins
-        """
-        self.__printBanner("getting histogram", ds)
-        data = self.getNumericData(ds)
-        frequency, lowLimit, binsize, extraPoints = sta.relfreq(data, numbins=nbins)
-        result = self.__printResult("frequency", frequency, "lowLimit", lowLimit, "binsize", binsize, "extraPoints", extraPoints)
-        return result
-
-    def getCumFreqDistr(self, ds, nbins=20):
-        """
-        gets cumulative freq distribution
-
-        Parameters
-        ds: data set name or list or numpy array
-        nbins: num of bins
-        """
-        self.__printBanner("getting cumulative freq distribution", ds)
-        data = self.getNumericData(ds)
-        cumFrequency, lowLimit, binsize, extraPoints = sta.cumfreq(data, numbins=nbins)
-        result = self.__printResult("cumFrequency", cumFrequency, "lowLimit", lowLimit, "binsize", binsize, "extraPoints", extraPoints)
-        return result
-
-    def getExtremeValue(self, ds, ensamp, nsamp, polarity, doPlotDistr, nbins=20):
-        """
-        gets extreme values
-
-        Parameters
-        ds: data set name or list or numpy array
-        ensamp: num of samples for extreme values
-        nsamp: num of samples
-        polarity: max or min
-        doPlotDistr: True to plot the distribution of extreme values
-        nbins: num of bins
-        """
-        self.__printBanner("getting extreme values", ds)
-        data = self.getNumericData(ds)
-        evalues = list()
-        for _ in range(ensamp):
-            values = selectRandomSubListFromListWithRepl(data, nsamp)
-            if polarity == "max":
-                evalues.append(max(values))
-            else:
-                evalues.append(min(values))
-        if doPlotDistr:
-            plt.hist(evalues, bins=nbins, cumulative=False, density=True)
-            plt.show()
-        result = self.__printResult("extremeValues", evalues)
-        return result
-
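getExtremeValue estimates extreme-value behavior by bootstrap: it draws nsamp values with replacement ensamp times and keeps the max (or min) of each draw. A minimal standalone sketch of the same idea with numpy; all names and constants below are illustrative, not part of this module:

import numpy as np

rng = np.random.default_rng(42)
data = rng.normal(loc=10.0, scale=2.0, size=1000)

# 500 bootstrap resamples of size 100, keeping the max of each resample
emax = [rng.choice(data, size=100, replace=True).max() for _ in range(500)]
print(np.mean(emax), np.std(emax))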
-    def getEntropy(self, ds, nbins=20):
-        """
-        gets entropy
-
-        Parameters
-        ds: data set name or list or numpy array
-        nbins: num of bins
-        """
-        self.__printBanner("getting entropy", ds)
-        data = self.getNumericData(ds)
-        result = self.getFreqDistr(data, nbins)
-        entropy = sta.entropy(result["frequency"])
-        result = self.__printResult("entropy", entropy)
-        return result
-
-    def getRelEntropy(self, ds1, ds2, nbins=20):
-        """
-        gets relative entropy or KL divergence with both data sets numeric
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        nbins: num of bins
-        """
-        self.__printBanner("getting relative entropy or KL divergence", ds1, ds2)
-        data1 = self.getNumericData(ds1)
-        data2 = self.getNumericData(ds2)
-        result1 = self.getFreqDistr(data1, nbins)
-        freq1 = result1["frequency"]
-        result2 = self.getFreqDistr(data2, nbins)
-        freq2 = result2["frequency"]
-        entropy = sta.entropy(freq1, freq2)
-        result = self.__printResult("relEntropy", entropy)
-        return result
-
-    def getAnyEntropy(self, ds, dt, nbins=20):
-        """
-        gets entropy of any data type, numeric or categorical
-
-        Parameters
-        ds: data set name or list or numpy array
-        dt: data type num or cat
-        nbins: num of bins
-        """
-        entropy = self.getEntropy(ds, nbins)["entropy"] if dt == "num" else self.getStatsCat(ds)["entropy"]
-        result = self.__printResult("entropy", entropy)
-        return result
-
-    def getJointEntropy(self, ds1, ds2, nbins=20):
-        """
-        gets joint entropy with both data sets numeric
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        nbins: num of bins
-        """
-        self.__printBanner("getting joint entropy", ds1, ds2)
-        data1 = self.getNumericData(ds1)
-        data2 = self.getNumericData(ds2)
-        self.ensureSameSize([data1, data2])
-        hist, xedges, yedges = np.histogram2d(data1, data2, bins=nbins)
-        hist = hist.flatten()
-        ssize = len(data1)
-        hist = hist / ssize
-        entropy = sta.entropy(hist)
-        result = self.__printResult("jointEntropy", entropy)
-        return result
-
-    def getAllNumMutualInfo(self, ds1, ds2, nbins=20):
-        """
-        gets mutual information for both numeric data
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        nbins: num of bins
-        """
-        self.__printBanner("getting mutual information", ds1, ds2)
-        en1 = self.getEntropy(ds1, nbins)
-        en2 = self.getEntropy(ds2, nbins)
-        en = self.getJointEntropy(ds1, ds2, nbins)
-        mutInfo = en1["entropy"] + en2["entropy"] - en["jointEntropy"]
-        result = self.__printResult("mutInfo", mutInfo)
-        return result
-
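getAllNumMutualInfo relies on the identity I(X;Y) = H(X) + H(Y) - H(X,Y), with each entropy estimated from histogram bin frequencies exactly as the entropy methods above do. A standalone sketch of that computation on synthetic data (illustrative only):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.normal(size=2000)
y = 0.7 * x + 0.3 * rng.normal(size=2000)    # y depends on x

nbins = 20
hx = stats.entropy(np.histogram(x, bins=nbins)[0] / len(x))
hy = stats.entropy(np.histogram(y, bins=nbins)[0] / len(y))
hxy = stats.entropy(np.histogram2d(x, y, bins=nbins)[0].flatten() / len(x))
print("mutual information", hx + hy - hxy)   # clearly positive for dependent x, y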
-    def getNumCatMutualInfo(self, nds, cds, nbins=20):
-        """
-        gets mutual information between numeric and categorical data
-
-        Parameters
-        nds: numeric data set name or list or numpy array
-        cds: categoric data set name or list
-        nbins: num of bins
-        """
-        self.__printBanner("getting mutual information of numerical and categorical data", nds, cds)
-        ndata = self.getNumericData(nds)
-        cdata = self.getCatData(cds)
-        nentr = self.getEntropy(nds)["entropy"]
-
-        #conditional entropy
-        cdistr = self.getStatsCat(cdata)["distr"]
-        grdata = self.getGroupByData(nds, cdata, True)["groupedData"]
-        cnentr = 0
-        for gr, data in grdata.items():
-            self.addListNumericData(data, "grdata")
-            gnentr = self.getEntropy("grdata")["entropy"]
-            cnentr += gnentr * cdistr[gr]
-
-        mutInfo = nentr - cnentr
-        result = self.__printResult("mutInfo", mutInfo, "entropy", nentr, "condEntropy", cnentr)
-        return result
-
-    def getTwoCatMutualInfo(self, cds1, cds2):
-        """
-        gets mutual information between 2 categorical data sets
-
-        Parameters
-        cds1: categoric data set name or list
-        cds2: categoric data set name or list
-        """
-        self.__printBanner("getting mutual information of two categorical data sets", cds1, cds2)
-        cdata1 = self.getCatData(cds1)
-        cdata2 = self.getCatData(cds2)
-        centr = self.getStatsCat(cds1)["entropy"]
-
-        #conditional entropy
-        cdistr = self.getStatsCat(cds2)["distr"]
-        grdata = self.getGroupByData(cds1, cds2, True)["groupedData"]
-        ccentr = 0
-        for gr, data in grdata.items():
-            self.addListCatData(data, "grdata")
-            gcentr = self.getStatsCat("grdata")["entropy"]
-            ccentr += gcentr * cdistr[gr]
-
-        mutInfo = centr - ccentr
-        result = self.__printResult("mutInfo", mutInfo, "entropy", centr, "condEntropy", ccentr)
-        return result
-
-    def getMutualInfo(self, dst, nbins=20):
-        """
-        gets mutual information between 2 data sets, any combination of numerical and categorical
-
-        Parameters
-        dst: data source, data type, data source, data type
-        nbins: num of bins
-        """
-        assertEqual(len(dst), 4, "invalid data source and data type list size")
-        dtypes = ["num", "cat"]
-        assertInList(dst[1], dtypes, "invalid data type")
-        assertInList(dst[3], dtypes, "invalid data type")
-        self.__printBanner("getting mutual information of any mix numerical and categorical data", dst[0], dst[2])
-
-        if dst[1] == "num":
-            mutInfo = self.getAllNumMutualInfo(dst[0], dst[2], nbins)["mutInfo"] if dst[3] == "num" \
-                else self.getNumCatMutualInfo(dst[0], dst[2], nbins)["mutInfo"]
-        else:
-            mutInfo = self.getNumCatMutualInfo(dst[2], dst[0], nbins)["mutInfo"] if dst[3] == "num" \
-                else self.getTwoCatMutualInfo(dst[2], dst[0])["mutInfo"]
-
-        result = self.__printResult("mutInfo", mutInfo)
-        return result
-
-    def getCondMutualInfo(self, dst, nbins=20):
-        """
-        gets conditional mutual information between 2 data sets, any combination of numerical and categorical
-
-        Parameters
-        dst: data source, data type, data source, data type, data source, data type
-        nbins: num of bins
-        """
-        assertEqual(len(dst), 6, "invalid data source and data type list size")
-        dtypes = ["num", "cat"]
-        assertInList(dst[1], dtypes, "invalid data type")
-        assertInList(dst[3], dtypes, "invalid data type")
-        assertInList(dst[5], dtypes, "invalid data type")
-        self.__printBanner("getting conditional mutual information of any mix numerical and categorical data", dst[0], dst[2])
-
-        if dst[5] == "cat":
-            cdistr = self.getStatsCat(dst[4])["distr"]
-            grdata1 = self.getGroupByData(dst[0], dst[4], True)["groupedData"]
-            grdata2 = self.getGroupByData(dst[2], dst[4], True)["groupedData"]
-        else:
-            gdata = self.getNumericData(dst[4])
-            hist = Histogram.createWithNumBins(gdata, nbins)
-            cdistr = hist.distr()
-            grdata1 = self.getGroupByData(dst[0], dst[4], False)["groupedData"]
-            grdata2 = self.getGroupByData(dst[2], dst[4], False)["groupedData"]
-
-        cminfo = 0
-        for gr in grdata1.keys():
-            data1 = grdata1[gr]
-            data2 = grdata2[gr]
-            if dst[1] == "num":
-                self.addListNumericData(data1, "grdata1")
-            else:
-                self.addListCatData(data1, "grdata1")
-            if dst[3] == "num":
-                self.addListNumericData(data2, "grdata2")
-            else:
-                self.addListCatData(data2, "grdata2")
-            gdst = ["grdata1", dst[1], "grdata2", dst[3]]
-            minfo = self.getMutualInfo(gdst, nbins)["mutInfo"]
-            cminfo += minfo * cdistr[gr]
-
-        result = self.__printResult("condMutInfo", cminfo)
-        return result
-
-    def getPercentile(self, ds, value):
-        """
-        gets percentile
-
-        Parameters
-        ds: data set name or list or numpy array
-        value: the value
-        """
-        self.__printBanner("getting percentile", ds)
-        data = self.getNumericData(ds)
-        percent = sta.percentileofscore(data, value)
-        result = self.__printResult("value", value, "percentile", percent)
-        return result
-
-    def getValueRangePercentile(self, ds, value1, value2):
-        """
-        gets percentile difference for a value range
-
-        Parameters
-        ds: data set name or list or numpy array
-        value1: first value
-        value2: second value
-        """
-        self.__printBanner("getting percentile difference for value range", ds)
-        if value1 < value2:
-            v1 = value1
-            v2 = value2
-        else:
-            v1 = value2
-            v2 = value1
-        data = self.getNumericData(ds)
-        per1 = sta.percentileofscore(data, v1)
-        per2 = sta.percentileofscore(data, v2)
-        result = self.__printResult("valueFirst", value1, "valueSecond", value2, "percentileDiff", per2 - per1)
-        return result
-
-    def getValueAtPercentile(self, ds, percent):
-        """
-        gets value at percentile
-
-        Parameters
-        ds: data set name or list or numpy array
-        percent: percentile
-        """
-        self.__printBanner("getting value at percentile", ds)
-        data = self.getNumericData(ds)
-        assert isInRange(percent, 0, 100), "percent should be between 0 and 100"
-        value = sta.scoreatpercentile(data, percent)
-        result = self.__printResult("value", value, "percentile", percent)
-        return result
-
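The two scipy calls used above are inverses of each other: percentileofscore maps a value to its percentile rank, and scoreatpercentile maps a rank back to a value. A small illustrative round trip:

import numpy as np
from scipy import stats

data = np.array([2.0, 4.0, 4.0, 5.0, 7.0, 9.0, 12.0])
p = stats.percentileofscore(data, 7.0)    # percentile rank of the value 7
v = stats.scoreatpercentile(data, p)      # maps the rank back to a value
print(p, v)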
-    def getLessThanValues(self, ds, cvalue):
-        """
-        gets values less than given value
-
-        Parameters
-        ds: data set name or list or numpy array
-        cvalue: condition value
-        """
-        self.__printBanner("getting values less than", ds)
-        fdata = self.__getCondValues(ds, cvalue, "lt")
-        result = self.__printResult("count", len(fdata), "lessThanValues", fdata)
-        return result
-
-    def getGreaterThanValues(self, ds, cvalue):
-        """
-        gets values greater than given value
-
-        Parameters
-        ds: data set name or list or numpy array
-        cvalue: condition value
-        """
-        self.__printBanner("getting values greater than", ds)
-        fdata = self.__getCondValues(ds, cvalue, "gt")
-        result = self.__printResult("count", len(fdata), "greaterThanValues", fdata)
-        return result
-
-    def __getCondValues(self, ds, cvalue, cond):
-        """
-        gets conditional values
-
-        Parameters
-        ds: data set name or list or numpy array
-        cvalue: condition value
-        cond: condition
-        """
-        data = self.getNumericData(ds)
-        if cond == "lt":
-            ind = np.where(data < cvalue)
-        else:
-            ind = np.where(data > cvalue)
-        fdata = data[ind]
-        return fdata
-
-    def getUniqueValueCounts(self, ds, maxCnt=10):
-        """
-        gets unique values and counts
-
-        Parameters
-        ds: data set name or list or numpy array
-        maxCnt: max value count pairs to return
-        """
-        self.__printBanner("getting unique values and counts", ds)
-        data = self.getNumericData(ds)
-        values, counts = sta.find_repeats(data)
-        cardinality = len(values)
-        vc = list(zip(values, counts))
-        vc.sort(key=lambda v: v[1], reverse=True)
-        result = self.__printResult("cardinality", cardinality, "unique values and repeat counts", vc[:maxCnt])
-        return result
-
-    def getCatUniqueValueCounts(self, ds, maxCnt=10):
-        """
-        gets unique categorical values and counts
-
-        Parameters
-        ds: data set name or list or numpy array
-        maxCnt: max value count pairs to return
-        """
-        self.__printBanner("getting unique categorical values and counts", ds)
-        data = self.getCatData(ds)
-        series = pd.Series(data)
-        uvalues = series.value_counts()
-        values = uvalues.index.tolist()
-        counts = uvalues.tolist()
-        vc = list(zip(values, counts))
-        vc.sort(key=lambda v: v[1], reverse=True)
-        result = self.__printResult("cardinality", len(values), "unique values and repeat counts", vc[:maxCnt])
-        return result
-
-    def getCatAlphaValueCounts(self, ds):
-        """
-        gets alphabetic value count
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting alphabetic value counts", ds)
-        data = self.getCatData(ds)
-        series = pd.Series(data)
-        flags = series.str.isalpha().tolist()
-        count = sum(flags)
-        result = self.__printResult("alphabeticValueCount", count)
-        return result
-
-    def getCatNumValueCounts(self, ds):
-        """
-        gets numeric value count
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting numeric value counts", ds)
-        data = self.getCatData(ds)
-        series = pd.Series(data)
-        flags = series.str.isnumeric().tolist()
-        count = sum(flags)
-        result = self.__printResult("numericValueCount", count)
-        return result
-
-    def getCatAlphaNumValueCounts(self, ds):
-        """
-        gets alpha numeric value count
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting alpha numeric value counts", ds)
-        data = self.getCatData(ds)
-        series = pd.Series(data)
-        flags = series.str.isalnum().tolist()
-        count = sum(flags)
-        result = self.__printResult("alphaNumericValueCount", count)
-        return result
-
-    def getCatAllCharCounts(self, ds):
-        """
-        gets alphabetic, numeric and special char count list
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting alphabetic, numeric and special char counts", ds)
-        data = self.getCatData(ds)
-        counts = list()
-        for d in data:
-            r = getAlphaNumCharCount(d)
-            counts.append(r)
-        result = self.__printResult("allTypeCharCounts", counts)
-        return result
-
-    def getCatAlphaCharCounts(self, ds):
-        """
-        gets alphabetic char count list
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting alphabetic char counts", ds)
-        data = self.getCatData(ds)
-        counts = self.getCatAllCharCounts(ds)["allTypeCharCounts"]
-        counts = list(map(lambda r: r[0], counts))
-        result = self.__printResult("alphaCharCounts", counts)
-        return result
-
-    def getCatNumCharCounts(self, ds):
-        """
-        gets numeric char count list
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting numeric char counts", ds)
-        data = self.getCatData(ds)
-        counts = self.getCatAllCharCounts(ds)["allTypeCharCounts"]
-        counts = list(map(lambda r: r[1], counts))
-        result = self.__printResult("numCharCounts", counts)
-        return result
-
-    def getCatSpecialCharCounts(self, ds):
-        """
-        gets special char count list
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting special char counts", ds)
-        counts = self.getCatAllCharCounts(ds)["allTypeCharCounts"]
-        counts = list(map(lambda r: r[2], counts))
-        result = self.__printResult("specialCharCounts", counts)
-        return result
-
-    def getCatAlphaCharCountStats(self, ds):
-        """
-        gets alphabetic char count stats
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting alphabetic char count stats", ds)
-        counts = self.getCatAlphaCharCounts(ds)["alphaCharCounts"]
-        nz = counts.count(0)
-        st = self.__getBasicStats(np.array(counts))
-        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3], "zeroCount", nz)
-        return result
-
-    def getCatNumCharCountStats(self, ds):
-        """
-        gets numeric char count stats
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting numeric char count stats", ds)
-        counts = self.getCatNumCharCounts(ds)["numCharCounts"]
-        nz = counts.count(0)
-        st = self.__getBasicStats(np.array(counts))
-        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3], "zeroCount", nz)
-        return result
-
-    def getCatSpecialCharCountStats(self, ds):
-        """
-        gets special char count stats
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting special char count stats", ds)
-        counts = self.getCatSpecialCharCounts(ds)["specialCharCounts"]
-        nz = counts.count(0)
-        st = self.__getBasicStats(np.array(counts))
-        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3], "zeroCount", nz)
-        return result
-
-    def getCatFldLenStats(self, ds):
-        """
-        gets field length stats
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting field length stats", ds)
-        data = self.getCatData(ds)
-        le = list(map(lambda d: len(d), data))
-        st = self.__getBasicStats(np.array(le))
-        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3])
-        return result
-
-    def getCatCharCountStats(self, ds, ch):
-        """
-        gets occurrence count stats for a specified char
-
-        Parameters
-        ds: data set name or list or numpy array
-        ch: character
-        """
-        self.__printBanner("getting char occurrence count stats", ds)
-        data = self.getCatData(ds)
-        counts = list(map(lambda d: d.count(ch), data))
-        nz = counts.count(0)
-        st = self.__getBasicStats(np.array(counts))
-        result = self.__printResult("mean", st[0], "std dev", st[1], "max", st[2], "min", st[3], "zeroCount", nz)
-        return result
-
-    def getStats(self, ds, nextreme=5):
-        """
-        gets summary statistics
-
-        Parameters
-        ds: data set name or list or numpy array
-        nextreme: num of extreme values
-        """
-        self.__printBanner("getting summary statistics", ds)
-        data = self.getNumericData(ds)
-        stat = dict()
-        stat["length"] = len(data)
-        stat["min"] = data.min()
-        stat["max"] = data.max()
-        series = pd.Series(data)
-        stat["n smallest"] = series.nsmallest(n=nextreme).tolist()
-        stat["n largest"] = series.nlargest(n=nextreme).tolist()
-        stat["mean"] = data.mean()
-        stat["median"] = np.median(data)
-        mode, modeCnt = sta.mode(data)
-        stat["mode"] = mode[0]
-        stat["mode count"] = modeCnt[0]
-        stat["std"] = np.std(data)
-        stat["skew"] = sta.skew(data)
-        stat["kurtosis"] = sta.kurtosis(data)
-        stat["mad"] = sta.median_absolute_deviation(data)
-        self.pp.pprint(stat)
-        return stat
-
-    def getStatsCat(self, ds):
-        """
-        gets summary statistics for categorical data
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting summary statistics for categorical data", ds)
-        data = self.getCatData(ds)
-        ch = CatHistogram()
-        for d in data:
-            ch.add(d)
-        mode = ch.getMode()
-        entr = ch.getEntropy()
-        uvalues = ch.getUniqueValues()
-        distr = ch.getDistr()
-        result = self.__printResult("entropy", entr, "mode", mode, "uniqueValues", uvalues, "distr", distr)
-        return result
-
-    def getGroupByData(self, ds, gds, gdtypeCat, numBins=20):
-        """
-        groups data by the group by data set
-
-        Parameters
-        ds: data set name or list or numpy array
-        gds: group by data set name or list or numpy array
-        gdtypeCat: True if group by data is categorical
-        numBins: num of bins for numeric group by data
-        """
-        self.__printBanner("getting group by data", ds)
-        data = self.getAnyData(ds)
-        if gdtypeCat:
-            gdata = self.getCatData(gds)
-        else:
-            gdata = self.getNumericData(gds)
-            hist = Histogram.createWithNumBins(gdata, numBins)
-            gdata = list(map(lambda d: hist.bin(d), gdata))
-
-        self.ensureSameSize([data, gdata])
-        groups = dict()
-        for g, d in zip(gdata, data):
-            appendKeyedList(groups, g, d)
-
-        ve = self.verbose
-        self.verbose = False
-        result = self.__printResult("groupedData", groups)
-        self.verbose = ve
-        return result
-
-    def getDifference(self, ds, order, doPlot=False):
-        """
-        gets difference of given order
-
-        Parameters
-        ds: data set name or list or numpy array
-        order: order of difference
-        doPlot: True for plot
-        """
-        self.__printBanner("getting difference of given order", ds)
-        data = self.getNumericData(ds)
-        diff = difference(data, order)
-        if doPlot:
-            drawLine(diff)
-        return diff
-
-    def getTrend(self, ds, doPlot=False):
-        """
-        gets trend by linear regression on the time index
-
-        Parameters
-        ds: data set name or list or numpy array
-        doPlot: True if plotting needed
-        """
-        self.__printBanner("getting trend")
-        data = self.getNumericData(ds)
-        sz = len(data)
-        X = list(range(0, sz))
-        X = np.reshape(X, (sz, 1))
-        model = LinearRegression()
-        model.fit(X, data)
-        trend = model.predict(X)
-        sc = model.score(X, data)
-        coef = model.coef_
-        intc = model.intercept_
-        result = self.__printResult("coeff", coef, "intercept", intc, "r square error", sc, "trend", trend)
-
-        if doPlot:
-            plt.plot(data)
-            plt.plot(trend)
-            plt.show()
-        return result
-
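getTrend regresses the series against its time index, so the fitted slope is the per-step trend and score() gives the R squared of the linear fit. A standalone sketch with synthetic data (all numbers illustrative):

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(1)
y = 0.5 * np.arange(200) + rng.normal(scale=5.0, size=200)   # noisy upward trend

X = np.arange(200).reshape(-1, 1)
model = LinearRegression().fit(X, y)
print("slope", model.coef_[0], "intercept", model.intercept_, "r2", model.score(X, y))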
-    def getDiffSdNoisiness(self, ds):
-        """
-        gets noisiness based on std dev of first order difference
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        diff = self.getDifference(ds, 1)
-        noise = np.std(np.array(diff))
-        result = self.__printResult("noisiness", noise)
-        return result
-
-    def getMaRmseNoisiness(self, ds, wsize=5):
-        """
-        gets noisiness based on RMSE with moving average
-
-        Parameters
-        ds: data set name or list or numpy array
-        wsize: window size
-        """
-        assert wsize % 2 == 1, "window size must be odd"
-        data = self.getNumericData(ds)
-        wind = data[:wsize]
-        wstat = SlidingWindowStat.initialize(wind.tolist())
-
-        whsize = int(wsize / 2)
-        beg = whsize
-        end = len(data) - whsize - 1
-        sumSq = 0.0
-        mean = wstat.getStat()[0]
-        diff = data[beg] - mean
-        sumSq += diff * diff
-        for i in range(beg + 1, end, 1):
-            mean = wstat.addGetStat(data[i + whsize])[0]
-            diff = data[i] - mean
-            sumSq += (diff * diff)
-
-        noise = math.sqrt(sumSq / (len(data) - 2 * whsize))
-        result = self.__printResult("noisiness", noise)
-        return result
-
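getMaRmseNoisiness scores noisiness as the RMSE of each point against the centered moving average around it. An equivalent vectorized sketch with numpy convolution, assuming the window includes the point itself as the method above does:

import numpy as np

def ma_rmse_noisiness(data, wsize=5):
    # RMSE of each point against the centered moving average around it
    data = np.asarray(data)
    half = wsize // 2
    ma = np.convolve(data, np.ones(wsize) / wsize, mode="valid")   # centered means
    center = data[half : len(data) - half]
    return float(np.sqrt(np.mean((center - ma) ** 2)))

rng = np.random.default_rng(2)
print(ma_rmse_noisiness(np.sin(np.linspace(0, 6, 300)) + rng.normal(scale=0.1, size=300)))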
-    def deTrend(self, ds, trend, doPlot=False):
-        """
-        de trends the data
-
-        Parameters
-        ds: data set name or list or numpy array
-        trend: trend data
-        doPlot: True if plotting needed
-        """
-        self.__printBanner("doing de trend", ds)
-        data = self.getNumericData(ds)
-        sz = len(data)
-        detrended = list(map(lambda i: data[i] - trend[i], range(sz)))
-        if doPlot:
-            drawLine(detrended)
-        return detrended
-
-    def getTimeSeriesComponents(self, ds, model, freq, summaryOnly, doPlot=False):
-        """
-        extracts trend, cycle and residue components of time series
-
-        Parameters
-        ds: data set name or list or numpy array
-        model: model type
-        freq: seasonality period
-        summaryOnly: True if only summary needed in output
-        doPlot: True if plotting needed
-        """
-        self.__printBanner("extracting trend, cycle and residue components of time series", ds)
-        assert model == "additive" or model == "multiplicative", "model must be additive or multiplicative"
-        data = self.getNumericData(ds)
-        res = seasonal_decompose(data, model=model, period=freq)
-        if doPlot:
-            res.plot()
-            plt.show()
-
-        #summary of components
-        trend = np.array(removeNan(res.trend))
-        trendMean = trend.mean()
-        trendSlope = (trend[-1] - trend[0]) / (len(trend) - 1)
-        seasonal = np.array(removeNan(res.seasonal))
-        seasonalAmp = (seasonal.max() - seasonal.min()) / 2
-        resid = np.array(removeNan(res.resid))
-        residueMean = resid.mean()
-        residueStdDev = np.std(resid)
-
-        if summaryOnly:
-            result = self.__printResult("trendMean", trendMean, "trendSlope", trendSlope, "seasonalAmp", seasonalAmp,
-            "residueMean", residueMean, "residueStdDev", residueStdDev)
-        else:
-            result = self.__printResult("trendMean", trendMean, "trendSlope", trendSlope, "seasonalAmp", seasonalAmp,
-            "residueMean", residueMean, "residueStdDev", residueStdDev, "trend", res.trend, "seasonal", res.seasonal,
-            "residual", res.resid)
-        return result
-
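getTimeSeriesComponents wraps statsmodels seasonal_decompose and then summarizes each component. A minimal sketch of the underlying call on a synthetic monthly-style series (period and amplitudes are illustrative):

import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose

rng = np.random.default_rng(1)
t = np.arange(240)
series = 0.05 * t + 2.0 * np.sin(2 * np.pi * t / 12) + rng.normal(scale=0.5, size=240)

res = seasonal_decompose(series, model="additive", period=12)
print(np.nanmean(res.trend), np.nanstd(res.resid))    # trend level and residual spread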
-    def getGausianMixture(self, ncomp, cvType, ninit, *dsl):
-        """
-        finds gaussian mixture parameters
-
-        Parameters
-        ncomp: num of gaussian components
-        cvType: covariance type
-        ninit: num of initializations
-        dsl: list of data set name or list or numpy array
-        """
-        self.__printBanner("getting gaussian mixture parameters", *dsl)
-        assertInList(cvType, ["full", "tied", "diag", "spherical"], "invalid covariance type")
-        dmat = self.__stackData(*dsl)
-
-        gm = GaussianMixture(n_components=ncomp, covariance_type=cvType, n_init=ninit)
-        gm.fit(dmat)
-        weights = gm.weights_
-        means = gm.means_
-        covars = gm.covariances_
-        converged = gm.converged_
-        niter = gm.n_iter_
-        aic = gm.aic(dmat)
-        result = self.__printResult("weights", weights, "mean", means, "covariance", covars, "converged", converged, "num iterations", niter, "aic", aic)
-        return result
-
-    def getKmeansCluster(self, nclust, ninit, *dsl):
-        """
-        gets cluster parameters
-
-        Parameters
-        nclust: num of clusters
-        ninit: num of initializations
-        dsl: list of data set name or list or numpy array
-        """
-        self.__printBanner("getting kmeans cluster parameters", *dsl)
-        dmat = self.__stackData(*dsl)
-        nsamp = dmat.shape[0]
-
-        km = KMeans(n_clusters=nclust, n_init=ninit)
-        km.fit(dmat)
-        centers = km.cluster_centers_
-        avdist = math.sqrt(km.inertia_ / nsamp)
-        niter = km.n_iter_
-        score = km.score(dmat)
-        result = self.__printResult("centers", centers, "average distance", avdist, "num iterations", niter, "score", score)
-        return result
-
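A quick standalone check of the GaussianMixture call used above, on two well-separated 1-D clusters stacked as a single-column feature matrix (synthetic, illustrative):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(3)
# two 1-D gaussian clusters stacked as a (n, 1) feature matrix
dmat = np.concatenate([rng.normal(0, 1, 300), rng.normal(6, 1, 300)]).reshape(-1, 1)

gm = GaussianMixture(n_components=2, covariance_type="full", n_init=3).fit(dmat)
print(gm.weights_, gm.means_.ravel(), gm.converged_)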
-    def getPrincComp(self, ncomp, *dsl):
-        """
-        finds principal component parameters
-
-        Parameters
-        ncomp: num of principal components
-        dsl: list of data set name or list or numpy array
-        """
-        self.__printBanner("getting principal component parameters", *dsl)
-        dmat = self.__stackData(*dsl)
-        nfeat = dmat.shape[1]
-        assertGreater(nfeat, 1, "requires multiple features")
-        assertLesserEqual(ncomp, nfeat, "num of components greater than num of features")
-
-        pca = PCA(n_components=ncomp)
-        pca.fit(dmat)
-        comps = pca.components_
-        var = pca.explained_variance_
-        varr = pca.explained_variance_ratio_
-        svalues = pca.singular_values_
-        result = self.__printResult("components", comps, "variance", var, "variance ratio", varr, "singular values", svalues)
-        return result
-
-    def getOutliersWithIsoForest(self, contamination, *dsl):
-        """
-        finds outliers using isolation forest
-
-        Parameters
-        contamination: proportion of outliers in the data set
-        dsl: list of data set name or list or numpy array
-        """
-        self.__printBanner("getting outliers using isolation forest", *dsl)
-        assert contamination >= 0 and contamination <= 0.5, "contamination outside valid range"
-        dmat = self.__stackData(*dsl)
-
-        isf = IsolationForest(contamination=contamination)
-        ypred = isf.fit_predict(dmat)
-        mask = ypred == -1
-        doul = dmat[mask, :]
-        mask = ypred != -1
-        dwoul = dmat[mask, :]
-        result = self.__printResult("numOutliers", doul.shape[0], "outliers", doul, "dataWithoutOutliers", dwoul)
-        return result
-
-    def getOutliersWithLocalFactor(self, contamination, *dsl):
-        """
-        gets outliers using local outlier factor
-
-        Parameters
-        contamination: proportion of outliers in the data set
-        dsl: list of data set name or list or numpy array
-        """
-        self.__printBanner("getting outliers using local outlier factor", *dsl)
-        assert contamination >= 0 and contamination <= 0.5, "contamination outside valid range"
-        dmat = self.__stackData(*dsl)
-
-        lof = LocalOutlierFactor(contamination=contamination)
-        ypred = lof.fit_predict(dmat)
-        mask = ypred == -1
-        doul = dmat[mask, :]
-        mask = ypred != -1
-        dwoul = dmat[mask, :]
-        result = self.__printResult("numOutliers", doul.shape[0], "outliers", doul, "dataWithoutOutliers", dwoul)
-        return result
-
-    def getOutliersWithSupVecMach(self, nu, *dsl):
-        """
-        gets outliers using one class svm
-
-        Parameters
-        nu: upper bound on the fraction of training errors and a lower bound of the fraction of support vectors
-        dsl: list of data set name or list or numpy array
-        """
-        self.__printBanner("getting outliers using one class svm", *dsl)
-        assert nu >= 0 and nu <= 0.5, "error upper bound outside valid range"
-        dmat = self.__stackData(*dsl)
-
-        svm = OneClassSVM(nu=nu)
-        ypred = svm.fit_predict(dmat)
-        mask = ypred == -1
-        doul = dmat[mask, :]
-        mask = ypred != -1
-        dwoul = dmat[mask, :]
-        result = self.__printResult("numOutliers", doul.shape[0], "outliers", doul, "dataWithoutOutliers", dwoul)
-        return result
-
-    def getOutliersWithCovarDeterminant(self, contamination, *dsl):
-        """
-        gets outliers using covariance determinant
-
-        Parameters
-        contamination: proportion of outliers in the data set
-        dsl: list of data set name or list or numpy array
-        """
-        self.__printBanner("getting outliers using covariance determinant", *dsl)
-        assert contamination >= 0 and contamination <= 0.5, "contamination outside valid range"
-        dmat = self.__stackData(*dsl)
-
-        ee = EllipticEnvelope(contamination=contamination)
-        ypred = ee.fit_predict(dmat)
-        mask = ypred == -1
-        doul = dmat[mask, :]
-        mask = ypred != -1
-        dwoul = dmat[mask, :]
-        result = self.__printResult("numOutliers", doul.shape[0], "outliers", doul, "dataWithoutOutliers", dwoul)
-        return result
-
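All four outlier methods above share the same scikit-learn convention: fit_predict returns -1 for outliers and 1 for inliers, and a boolean mask splits the matrix. A compact sketch with IsolationForest and two planted outliers (data and parameters illustrative):

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(4)
dmat = np.vstack([rng.normal(0, 1, (200, 2)), np.array([[8.0, 8.0], [-7.0, 9.0]])])

ypred = IsolationForest(contamination=0.01, random_state=0).fit_predict(dmat)
outliers = dmat[ypred == -1]          # -1 marks outliers, 1 marks inliers
print(len(outliers), outliers)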
-    def getOutliersWithZscore(self, ds, zthreshold, stats=None):
-        """
-        gets outliers using zscore
-
-        Parameters
-        ds: data set name or list or numpy array
-        zthreshold: z score threshold
-        stats: tuple containing mean and std dev
-        """
-        self.__printBanner("getting outliers using zscore", ds)
-        data = self.getNumericData(ds)
-        if stats is None:
-            mean = data.mean()
-            sd = np.std(data)
-        else:
-            mean = stats[0]
-            sd = stats[1]
-
-        zs = list(map(lambda d: abs((d - mean) / sd), data))
-        outliers = list(filter(lambda r: r[1] > zthreshold, enumerate(zs)))
-        result = self.__printResult("outliers", outliers)
-        return result
-
-    def getOutliersWithRobustZscore(self, ds, zthreshold, stats=None):
-        """
-        gets outliers using robust zscore
-
-        Parameters
-        ds: data set name or list or numpy array
-        zthreshold: z score threshold
-        stats: tuple containing median and median absolute deviation
-        """
-        self.__printBanner("getting outliers using robust zscore", ds)
-        data = self.getNumericData(ds)
-        if stats is None:
-            med = np.median(data)
-            dev = np.array(list(map(lambda d: abs(d - med), data)))
-            #1.4826 scales the MAD to a consistent std dev estimate for gaussian data
-            mad = 1.4826 * np.median(dev)
-        else:
-            med = stats[0]
-            mad = stats[1]
-
-        rzs = list(map(lambda d: abs((d - med) / mad), data))
-        outliers = list(filter(lambda r: r[1] > zthreshold, enumerate(rzs)))
-        result = self.__printResult("outliers", outliers)
-        return result
-
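A standalone sketch of the robust z-score used above: deviations from the median are scaled by the MAD, with the 1.4826 factor making the MAD a consistent estimate of the standard deviation for gaussian data:

import numpy as np

def robust_zscores(data):
    data = np.asarray(data)
    med = np.median(data)
    mad = 1.4826 * np.median(np.abs(data - med))   # MAD scaled to match std dev
    return np.abs(data - med) / mad

scores = robust_zscores([10.0, 10.2, 9.9, 10.1, 10.0, 25.0])
print(np.nonzero(scores > 3.0)[0])                 # index of the outlier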
-    def getSubsequenceOutliersWithDissimilarity(self, subSeqSize, ds):
-        """
-        gets subsequence outlier with subsequence pairwise dissimilarity
-
-        Parameters
-        subSeqSize: sub sequence size
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("doing sub sequence anomaly detection with dissimilarity", ds)
-        data = self.getNumericData(ds)
-        sz = len(data)
-        dist = dict()
-        minDist = dict()
-        for i in range(sz - subSeqSize):
-            #first window
-            w1 = data[i : i + subSeqSize]
-            dmin = None
-            for j in range(sz - subSeqSize):
-                #second window not overlapping with the first
-                if j + subSeqSize <= i or j >= i + subSeqSize:
-                    w2 = data[j : j + subSeqSize]
-                    k = (j, i)
-                    if k in dist:
-                        d = dist[k]
-                    else:
-                        d = euclideanDistance(w1, w2)
-                        k = (i, j)
-                        dist[k] = d
-                    if dmin is None:
-                        dmin = d
-                    else:
-                        dmin = d if d < dmin else dmin
-            minDist[i] = dmin
-
-        #find max of min distances
-        dmax = None
-        offset = None
-        for k in minDist.keys():
-            d = minDist[k]
-            if dmax is None:
-                dmax = d
-                offset = k
-            else:
-                if d > dmax:
-                    dmax = d
-                    offset = k
-        result = self.__printResult("subSeqOffset", offset, "outlierScore", dmax)
-        return result
-
-    def getNullCount(self, ds):
-        """
-        gets count of null fields
-
-        Parameters
-        ds: data set name or list or numpy array with data
-        """
-        self.__printBanner("getting null value count", ds)
-        if type(ds) == str:
-            assert ds in self.dataSets, "data set {} does not exist, please add it first".format(ds)
-            data = self.dataSets[ds]
-            ser = pd.Series(data)
-        elif type(ds) == list or type(ds) == np.ndarray:
-            ser = pd.Series(ds)
-            data = ds
-        else:
-            raise ValueError("invalid data type")
-        nv = ser.isnull().tolist()
-        nullCount = nv.count(True)
-        nullFraction = nullCount / len(data)
-        result = self.__printResult("nullFraction", nullFraction, "nullCount", nullCount)
-        return result
-
-    def fitLinearReg(self, dsx, ds, doPlot=False):
-        """
-        fits linear regression
-
-        Parameters
-        dsx: x data set name or None
-        ds: data set name or list or numpy array
-        doPlot: True if plotting needed
-        """
-        self.__printBanner("fitting linear regression", ds)
-        data = self.getNumericData(ds)
-        if dsx is None:
-            x = np.arange(len(data))
-        else:
-            x = self.getNumericData(dsx)
-        slope, intercept, rvalue, pvalue, stderr = sta.linregress(x, data)
-        result = self.__printResult("slope", slope, "intercept", intercept, "rvalue", rvalue, "pvalue", pvalue, "stderr", stderr)
-        if doPlot:
-            self.plotRegFit(x, data, slope, intercept)
-        return result
-
-    def fitSiegelRobustLinearReg(self, ds, doPlot=False):
-        """
-        Siegel robust linear regression fit based on median
-
-        Parameters
-        ds: data set name or list or numpy array
-        doPlot: True if plotting needed
-        """
-        self.__printBanner("fitting siegel robust linear regression based on median", ds)
-        data = self.getNumericData(ds)
-        slope, intercept = sta.siegelslopes(data)
-        result = self.__printResult("slope", slope, "intercept", intercept)
-        if doPlot:
-            x = np.arange(len(data))
-            self.plotRegFit(x, data, slope, intercept)
-        return result
-
-    def fitTheilSenRobustLinearReg(self, ds, doPlot=False):
-        """
-        Theil-Sen robust linear regression fit based on median
-
-        Parameters
-        ds: data set name or list or numpy array
-        doPlot: True if plotting needed
-        """
-        self.__printBanner("fitting theil sen robust linear regression based on median", ds)
-        data = self.getNumericData(ds)
-        slope, intercept, loSlope, upSlope = sta.theilslopes(data)
-        result = self.__printResult("slope", slope, "intercept", intercept, "lower slope", loSlope, "upper slope", upSlope)
-        if doPlot:
-            x = np.arange(len(data))
-            self.plotRegFit(x, data, slope, intercept)
-        return result
-
-    def plotRegFit(self, x, y, slope, intercept):
-        """
-        plots linear regression fit line
-
-        Parameters
-        x: x values
-        y: y values
-        slope: slope
-        intercept: intercept
-        """
-        self.__printBanner("plotting linear regression fit line")
-        fig = plt.figure()
-        ax = fig.add_subplot(111)
-        ax.plot(x, y, "b.")
-        ax.plot(x, intercept + slope * x, "r-")
-        plt.show()
-
-    def getRegFit(self, xvalues, yvalues, slope, intercept):
-        """
-        gets fitted line and residue
-
-        Parameters
-        xvalues: x values
-        yvalues: y values
-        slope: regression slope
-        intercept: regression intercept
-        """
-        yfit = list()
-        residue = list()
-        for x, y in zip(xvalues, yvalues):
-            yf = x * slope + intercept
-            yfit.append(yf)
-            r = y - yf
-            residue.append(r)
-        result = self.__printResult("fitted line", yfit, "residue", residue)
-        return result
-
-    def getInfluentialPoints(self, dsx, dsy):
-        """
-        gets influential points in regression model with Cook's distance
-
-        Parameters
-        dsx: data set name or list or numpy array for x
-        dsy: data set name or list or numpy array for y
-        """
-        self.__printBanner("finding influential points for linear regression", dsx, dsy)
-        y = self.getNumericData(dsy)
-        x = np.arange(len(y)) if dsx is None else self.getNumericData(dsx)
-        model = sm.OLS(y, x).fit()
-        np.set_printoptions(suppress=True)
-        influence = model.get_influence()
-        cooks = influence.cooks_distance
-        result = self.__printResult("Cook distance", cooks)
-        return result
-
-    def getCovar(self, *dsl):
-        """
-        gets covariance
-
-        Parameters
-        dsl: list of data set name or list or numpy array
-        """
-        self.__printBanner("getting covariance", *dsl)
-        data = list(map(lambda ds: self.getNumericData(ds), dsl))
-        self.ensureSameSize(data)
-        data = np.vstack(data)
-        cv = np.cov(data)
-        print(cv)
-        return cv
-
-    def getPearsonCorr(self, ds1, ds2, sigLev=.05):
-        """
-        gets pearson correlation coefficient
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        sigLev: statistical significance level
-        """
-        self.__printBanner("getting pearson correlation coefficient", ds1, ds2)
-        data1 = self.getNumericData(ds1)
-        data2 = self.getNumericData(ds2)
-        self.ensureSameSize([data1, data2])
-        stat, pvalue = sta.pearsonr(data1, data2)
-        result = self.__printResult("stat", stat, "pvalue", pvalue)
-        self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
-        return result
-
-    def getSpearmanRankCorr(self, ds1, ds2, sigLev=.05):
-        """
-        gets spearman correlation coefficient
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        sigLev: statistical significance level
-        """
-        self.__printBanner("getting spearman correlation coefficient", ds1, ds2)
-        data1 = self.getNumericData(ds1)
-        data2 = self.getNumericData(ds2)
-        self.ensureSameSize([data1, data2])
-        stat, pvalue = sta.spearmanr(data1, data2)
-        result = self.__printResult("stat", stat, "pvalue", pvalue)
-        self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
-        return result
-
-    def getKendalRankCorr(self, ds1, ds2, sigLev=.05):
-        """
-        kendall’s tau, a correlation measure for ordinal data
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        sigLev: statistical significance level
-        """
-        self.__printBanner("getting kendall’s tau, a correlation measure for ordinal data", ds1, ds2)
-        data1 = self.getNumericData(ds1)
-        data2 = self.getNumericData(ds2)
-        self.ensureSameSize([data1, data2])
-        stat, pvalue = sta.kendalltau(data1, data2)
-        result = self.__printResult("stat", stat, "pvalue", pvalue)
-        self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
-        return result
-
-    def getPointBiserialCorr(self, ds1, ds2, sigLev=.05):
-        """
-        point biserial correlation between binary and numeric
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        sigLev: statistical significance level
-        """
-        self.__printBanner("getting point biserial correlation between binary and numeric", ds1, ds2)
-        data1 = self.getNumericData(ds1)
-        data2 = self.getNumericData(ds2)
-        assert isBinary(data1), "first data set is not binary"
-        self.ensureSameSize([data1, data2])
-        stat, pvalue = sta.pointbiserialr(data1, data2)
-        result = self.__printResult("stat", stat, "pvalue", pvalue)
-        self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
-        return result
-
-    def getConTab(self, ds1, ds2):
-        """
-        gets contingency table for categorical data pair
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        """
-        self.__printBanner("getting contingency table for categorical data", ds1, ds2)
-        data1 = self.getCatData(ds1)
-        data2 = self.getCatData(ds2)
-        self.ensureSameSize([data1, data2])
-        crosstab = pd.crosstab(pd.Series(data1), pd.Series(data2), margins=False)
-        ctab = crosstab.values
-        print("contingency table")
-        print(ctab)
-        return ctab
-
-    def getChiSqCorr(self, ds1, ds2, sigLev=.05):
-        """
-        chi square correlation for categorical data pair
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        sigLev: statistical significance level
-        """
-        self.__printBanner("getting chi square correlation for two categorical", ds1, ds2)
-        ctab = self.getConTab(ds1, ds2)
-        stat, pvalue, dof, expctd = sta.chi2_contingency(ctab)
-        result = self.__printResult("stat", stat, "pvalue", pvalue, "dof", dof, "expected", expctd)
-        self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
-        return result
-
-    def getSizeCorrectChiSqCorr(self, ds1, ds2, chisq):
-        """
-        Cramer's V size corrected chi square correlation for categorical data pair
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        chisq: chi square stat
-        """
-        self.__printBanner("getting size corrected chi square correlation for two categorical", ds1, ds2)
-        c1 = self.getCatUniqueValueCounts(ds1)["cardinality"]
-        c2 = self.getCatUniqueValueCounts(ds2)["cardinality"]
-        c = min(c1, c2)
-        assertGreater(c, 1, "min cardinality should be greater than 1")
-        l = len(self.getCatData(ds1))
-        t = l * (c - 1)
-        stat = math.sqrt(chisq / t)
-        result = self.__printResult("stat", stat)
-        return result
-
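getSizeCorrectChiSqCorr computes Cramer's V: the chi square statistic normalized by sample size times (min cardinality - 1), giving a value in [0, 1]. A standalone sketch on a small contingency table (numbers illustrative):

import math
import numpy as np
from scipy import stats

ctab = np.array([[30, 10], [15, 45]])              # 2 x 2 contingency table
chisq = stats.chi2_contingency(ctab)[0]
n = ctab.sum()
c = min(ctab.shape)                                # smaller cardinality
v = math.sqrt(chisq / (n * (c - 1)))               # Cramer's V in [0, 1]
print(v)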
-    def getAnovaCorr(self, ds1, ds2, grByCol, sigLev=.05):
-        """
-        anova correlation for numerical categorical
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        grByCol: group by column
-        sigLev: statistical significance level
-        """
-        self.__printBanner("anova correlation for numerical categorical", ds1, ds2)
-        df = self.loadCatFloatDataFrame(ds1, ds2) if grByCol == 0 else self.loadCatFloatDataFrame(ds2, ds1)
-        grByCol = 0
-        dCol = 1
-        grouped = df.groupby([grByCol])
-        dlist = list(map(lambda v: v[1].loc[:, dCol].values, grouped))
-        stat, pvalue = sta.f_oneway(*dlist)
-        result = self.__printResult("stat", stat, "pvalue", pvalue)
-        self.__printStat(stat, pvalue, "probably uncorrelated", "probably correlated", sigLev)
-        return result
-
-    def plotAutoCorr(self, ds, lags, alpha, diffOrder=0):
-        """
-        plots auto correlation
-
-        Parameters
-        ds: data set name or list or numpy array
-        lags: num of lags
-        alpha: confidence level
-        diffOrder: order of differencing applied before plotting
-        """
-        self.__printBanner("plotting auto correlation", ds)
-        data = self.getNumericData(ds)
-        ddata = difference(data, diffOrder) if diffOrder > 0 else data
-        tsaplots.plot_acf(ddata, lags=lags, alpha=alpha)
-        plt.show()
-
-    def getAutoCorr(self, ds, lags, alpha=.05):
-        """
-        gets auto correlation
-
-        Parameters
-        ds: data set name or list or numpy array
-        lags: num of lags
-        alpha: confidence level
-        """
-        self.__printBanner("getting auto correlation", ds)
-        data = self.getNumericData(ds)
-        autoCorr, confIntv = stt.acf(data, nlags=lags, fft=False, alpha=alpha)
-        result = self.__printResult("autoCorr", autoCorr, "confIntv", confIntv)
-        return result
-
-    def plotParAcf(self, ds, lags, alpha):
-        """
-        plots partial auto correlation
-
-        Parameters
-        ds: data set name or list or numpy array
-        lags: num of lags
-        alpha: confidence level
-        """
-        self.__printBanner("plotting partial auto correlation", ds)
-        data = self.getNumericData(ds)
-        tsaplots.plot_pacf(data, lags=lags, alpha=alpha)
-        plt.show()
-
-    def getParAutoCorr(self, ds, lags, alpha=.05):
-        """
-        gets partial auto correlation
-
-        Parameters
-        ds: data set name or list or numpy array
-        lags: num of lags
-        alpha: confidence level
-        """
-        self.__printBanner("getting partial auto correlation", ds)
-        data = self.getNumericData(ds)
-        partAutoCorr, confIntv = stt.pacf(data, nlags=lags, alpha=alpha)
-        result = self.__printResult("partAutoCorr", partAutoCorr, "confIntv", confIntv)
-        return result
-
-    def getHurstExp(self, ds, kind, doPlot=True):
-        """
-        gets Hurst exponent of time series
-
-        Parameters
-        ds: data set name or list or numpy array
-        kind: kind of data change, random_walk, price
-        doPlot: True for plot
-        """
-        self.__printBanner("getting Hurst exponent", ds)
-        data = self.getNumericData(ds)
-        h, c, odata = hurst.compute_Hc(data, kind=kind, simplified=False)
-        if doPlot:
-            f, ax = plt.subplots()
-            ax.plot(odata[0], c * odata[0] ** h, color="deepskyblue")
-            ax.scatter(odata[0], odata[1], color="purple")
-            ax.set_xscale("log")
-            ax.set_yscale("log")
-            ax.set_xlabel("time interval")
-            ax.set_ylabel("cum dev range and std dev ratio")
-            ax.grid(True)
-            plt.show()
-
-        result = self.__printResult("hurstExponent", h, "hurstConstant", c)
-        return result
-
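A minimal sketch of the hurst package call used above; a pure random walk should come out near H = 0.5 (treat the exact API details here as an assumption of this sketch):

import numpy as np
from hurst import compute_Hc

rng = np.random.default_rng(2)
walk = np.cumsum(rng.normal(size=2000)) + 1000.0    # brownian-like series

H, c, _ = compute_Hc(walk, kind="random_walk", simplified=True)
print("Hurst exponent", H)    # close to 0.5 for a pure random walk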
-    def approxEntropy(self, ds, m, r):
-        """
-        gets approximate entropy of time series (ref: wikipedia)
-
-        Parameters
-        ds: data set name or list or numpy array
-        m: length of compared run of data
-        r: filtering level
-        """
-        self.__printBanner("getting approximate entropy", ds)
-        ldata = self.getNumericData(ds)
-        aent = abs(self.__phi(ldata, m + 1, r) - self.__phi(ldata, m, r))
-        result = self.__printResult("approxEntropy", aent)
-        return result
-
-    def __phi(self, ldata, m, r):
-        """
-        phi function for approximate entropy
-
-        Parameters
-        ldata: data array
-        m: length of compared run of data
-        r: filtering level
-        """
-        le = len(ldata)
-        x = [[ldata[j] for j in range(i, i + m)] for i in range(le - m + 1)]
-        lex = len(x)
-        c = list()
-        for i in range(lex):
-            cnt = 0
-            for j in range(lex):
-                cnt += (1 if maxListDist(x[i], x[j]) <= r else 0)
-            cnt /= (le - m + 1.0)
-            c.append(cnt)
-        return sum(np.log(c)) / (le - m + 1.0)
-
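The two methods above implement approximate entropy as |phi(m+1) - phi(m)|, where phi counts, for each length-m template, the fraction of templates within Chebyshev distance r. A compact vectorized sketch of the same formulation; a perfectly regular series should score near zero:

import numpy as np

def approx_entropy(u, m, r):
    def phi(m):
        n = len(u) - m + 1
        x = np.array([u[i : i + m] for i in range(n)])
        # fraction of templates within chebyshev distance r of each template
        counts = [np.sum(np.max(np.abs(x - xi), axis=1) <= r) / n for xi in x]
        return np.sum(np.log(counts)) / n
    return abs(phi(m + 1) - phi(m))

print(approx_entropy(np.tile([1.0, 2.0], 50), m=2, r=0.5))    # ~0 for a regular series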
-    def oneSpaceEntropy(self, ds, scaMethod="zscale"):
-        """
-        gets one space entropy (ref: Estimating mutual information by Kraskov)
-
-        Parameters
-        ds: data set name or list or numpy array
-        scaMethod: scaling method
-        """
-        self.__printBanner("getting one space entropy", ds)
-        data = self.getNumericData(ds)
-        sdata = sorted(data)
-        sdata = scaleData(sdata, scaMethod)
-        su = 0
-        n = len(sdata)
-        for i in range(1, n, 1):
-            t = abs(sdata[i] - sdata[i-1])
-            if t > 0:
-                su += math.log(t)
-        su /= (n - 1)
-        ose = digammaFun(n) - digammaFun(1) + su
-        result = self.__printResult("entropy", ose)
-        return result
-
-    def plotCrossCorr(self, ds1, ds2, normed, lags):
-        """
-        plots cross correlation
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        normed: if True, input vectors are normalised to unit length
-        lags: num of lags
-        """
-        self.__printBanner("plotting cross correlation between two numeric", ds1, ds2)
-        data1 = self.getNumericData(ds1)
-        data2 = self.getNumericData(ds2)
-        self.ensureSameSize([data1, data2])
-        plt.xcorr(data1, data2, normed=normed, maxlags=lags)
-        plt.show()
-
-    def getCrossCorr(self, ds1, ds2):
-        """
-        gets cross correlation
-
-        Parameters
-        ds1: data set name or list or numpy array
-        ds2: data set name or list or numpy array
-        """
-        self.__printBanner("getting cross correlation", ds1, ds2)
-        data1 = self.getNumericData(ds1)
-        data2 = self.getNumericData(ds2)
-        self.ensureSameSize([data1, data2])
-        crossCorr = stt.ccf(data1, data2)
-        result = self.__printResult("crossCorr", crossCorr)
-        return result
-
-    def getFourierTransform(self, ds):
-        """
-        gets fast fourier transform
-
-        Parameters
-        ds: data set name or list or numpy array
-        """
-        self.__printBanner("getting fourier transform", ds)
-        data = self.getNumericData(ds)
-        ft = np.fft.rfft(data)
-        result = self.__printResult("fourierTransform", ft)
-        return result
-
-    def testStationaryAdf(self, ds, regression, autolag, sigLev=.05):
-        """
-        ADF stationarity test, null hypothesis: not stationary
-
-        Parameters
-        ds: data set name or list or numpy array
-        regression: constant and trend order to include in regression
-        autolag: method to use when automatically determining the lag
-        sigLev: statistical significance level
-        """
-        self.__printBanner("doing ADF stationary test", ds)
-        relist = ["c", "ct", "ctt", "nc"]
-        assert regression in relist, "invalid regression value"
-        alList = ["AIC", "BIC", "t-stat", None]
-        assert autolag in alList, "invalid autolag value"
-
-        data = self.getNumericData(ds)
-        re = stt.adfuller(data, regression=regression, autolag=autolag)
-        result = self.__printResult("stat", re[0], "pvalue", re[1], "num lags", re[2], "num observation for regression", re[3],
-        "critical values", re[4])
-        self.__printStat(re[0], re[1], "probably not stationary", "probably stationary", sigLev)
-        return result
-
-    def testStationaryKpss(self, ds, regression, nlags, sigLev=.05):
-        """
-        KPSS stationarity test, null hypothesis: stationary
-
-        Parameters
-        ds: data set name or list or numpy array
-        regression: constant and trend order to include in regression
-        nlags: num of lags
-        sigLev: statistical significance level
-        """
-        self.__printBanner("doing KPSS stationary test", ds)
-        relist = ["c", "ct"]
-        assert regression in relist, "invalid regression value"
-        nlList = [None, "auto", "legacy"]
-        assert nlags in nlList or type(nlags) == int, "invalid nlags value"
-
-        data = self.getNumericData(ds)
-        stat, pvalue, nLags, criticalValues = stt.kpss(data, regression=regression, nlags=nlags)
-        result = self.__printResult("stat", stat, "pvalue", pvalue, "num lags", nLags, "critical values", criticalValues)
-        self.__printStat(stat, pvalue, "probably stationary", "probably not stationary", sigLev)
-        return result
-
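A standalone sketch of the two statsmodels calls used above: ADF (null: unit root, not stationary) on a random walk, and KPSS (null: stationary) on its first difference. Parameter values are illustrative:

import numpy as np
from statsmodels.tsa.stattools import adfuller, kpss

rng = np.random.default_rng(3)
walk = np.cumsum(rng.normal(size=500))    # random walk, not stationary

stat, pvalue = adfuller(walk, regression="c", autolag="AIC")[:2]
print("adf pvalue", pvalue)               # large, fails to reject the unit root

stat, pvalue = kpss(np.diff(walk), regression="c", nlags="auto")[:2]
print("kpss pvalue", pvalue)              # large, consistent with stationarity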
-    def testNormalJarqBera(self, ds, sigLev=.05):
-        """
-        Jarque-Bera normalcy test
-
-        Parameters
-        ds: data set name or list or numpy array
-        sigLev: statistical significance level
-        """
-        self.__printBanner("doing jarque bera normalcy test", ds)
-        data = self.getNumericData(ds)
-        jb, jbpv, skew, kurtosis = sstt.jarque_bera(data)
-        result = self.__printResult("stat", jb, "pvalue", jbpv, "skew", skew, "kurtosis", kurtosis)
-        self.__printStat(jb, jbpv, "probably gaussian", "probably not gaussian", sigLev)
-        return result
-
-    def testNormalShapWilk(self, ds, sigLev=.05):
-        """
-        Shapiro-Wilk normalcy test
-
-        Parameters
-        ds: data set name or list or numpy array
-        sigLev: statistical significance level
-        """
-        self.__printBanner("doing shapiro wilks normalcy test", ds)
-        data = self.getNumericData(ds)
-        stat, pvalue = sta.shapiro(data)
-        result = self.__printResult("stat", stat, "pvalue", pvalue)
-        self.__printStat(stat, pvalue, "probably gaussian", "probably not gaussian", sigLev)
-        return result
-
-    def testNormalDagast(self, ds, sigLev=.05):
-        """
-        D’Agostino’s K square normalcy test
-
-        Parameters
-        ds: data set name or list or numpy array
-        sigLev: statistical significance level
-        """
-        self.__printBanner("doing D’Agostino’s K square normalcy test", ds)
-        data = self.getNumericData(ds)
-        stat, pvalue = sta.normaltest(data)
-        result = self.__printResult("stat", stat, "pvalue", pvalue)
-        self.__printStat(stat, pvalue, "probably gaussian", "probably not gaussian", sigLev)
-        return result
-
- def testDistrAnderson(self, ds, dist, sigLev=.05):
2267
- """
2268
- Anderson test for normal, expon, logistic, gumbel, gumbel_l, gumbel_r
2269
-
2270
- Parameters
2271
- ds: data set name or list or numpy array
2272
- dist: type of distribution
2273
- sigLev: statistical significance level
2274
- """
2275
- self.__printBanner("doing Anderson test for for various distributions", ds)
2276
- diList = ["norm", "expon", "logistic", "gumbel", "gumbel_l", "gumbel_r", "extreme1"]
2277
- assert dist in diList, "invalid distribution"
2278
-
2279
- data = self.getNumericData(ds)
2280
- re = sta.anderson(data)
2281
- slAlpha = int(100 * sigLev)
2282
- msg = "significnt value not found"
2283
- for i in range(len(re.critical_values)):
2284
- sl, cv = re.significance_level[i], re.critical_values[i]
2285
- if int(sl) == slAlpha:
2286
- if re.statistic < cv:
2287
- msg = "probably {} at the {:.3f} siginificance level".format(dist, sl)
2288
- else:
2289
- msg = "probably not {} at the {:.3f} siginificance level".format(dist, sl)
2290
- result = self.__printResult("stat", re.statistic, "test", msg)
2291
- print(msg)
2292
- return result
2293
-
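For orientation, the normality tests above look like this when scipy is called directly on a deliberately non gaussian sample (an illustrative sketch; the exponential data is an assumption):

import numpy as np
from scipy import stats as sta

x = np.random.exponential(size=300)
swStat, swPvalue = sta.shapiro(x)
daStat, daPvalue = sta.normaltest(x)
adResult = sta.anderson(x, dist="norm")
# expect tiny pvalues and an Anderson statistic above its critical values
print(swPvalue, daPvalue, adResult.statistic, adResult.critical_values)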
- 	def testSkew(self, ds, sigLev=.05):
- 		"""
- 		test skew wrt normal distr
- 
- 		Parameters
- 		ds: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("testing skew wrt normal distr", ds)
- 		data = self.getNumericData(ds)
- 		stat, pvalue = sta.skewtest(data)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same skew as normal distribution", "probably not same skew as normal distribution", sigLev)
- 		return result
- 
- 	def testTwoSampleStudent(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Student t 2 sample test
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Student t 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.ttest_ind(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
- 		return result
- 
- 	def testTwoSampleKs(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Kolmogorov-Smirnov 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Kolmogorov-Smirnov 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.ks_2samp(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
- 		return result
- 
- 	def testTwoSampleMw(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Mann-Whitney 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Mann-Whitney 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.mannwhitneyu(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
- 		return result
- 
- 	def testTwoSampleWilcox(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Wilcoxon Signed-Rank 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Wilcoxon Signed-Rank 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.wilcoxon(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
- 		return result
- 
- 	def testTwoSampleKw(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Kruskal-Wallis 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Kruskal-Wallis 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.kruskal(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
- 		return result
- 
- 	def testTwoSampleFriedman(self, ds1, ds2, ds3, sigLev=.05):
- 		"""
- 		Friedman statistic for 3 samples
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		ds3: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Friedman 3 sample test", ds1, ds2, ds3)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		data3 = self.getNumericData(ds3)
- 		stat, pvalue = sta.friedmanchisquare(data1, data2, data3)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
- 		return result
- 
- 	def testTwoSampleEs(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Epps-Singleton 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Epps-Singleton 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.epps_singleton_2samp(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same distribution", "probably not same distribution", sigLev)
- 		return result
- 
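A compact usage sketch for the two sample tests above, with scipy called directly (the synthetic shifted samples are an assumption):

import numpy as np
from scipy import stats as sta

a = np.random.normal(0.0, 1.0, 200)
b = np.random.normal(0.5, 1.0, 200)
for test in (sta.ttest_ind, sta.ks_2samp, sta.mannwhitneyu):
    stat, pvalue = test(a, b)
    print(test.__name__, stat, pvalue)    # expect small pvalues for the 0.5 shift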
- 	def testTwoSampleAnderson(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Anderson-Darling 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Anderson-Darling 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		dseq = (data1, data2)
- 		stat, critValues, sLev = sta.anderson_ksamp(dseq)
- 		slAlpha = 100 * sigLev
- 
- 		if slAlpha == 10:
- 			cv = critValues[1]
- 		elif slAlpha == 5:
- 			cv = critValues[2]
- 		elif slAlpha == 2.5:
- 			cv = critValues[3]
- 		elif slAlpha == 1:
- 			cv = critValues[4]
- 		else:
- 			cv = None
- 
- 		result = self.__printResult("stat", stat, "critValues", critValues, "critValue", cv, "significanceLevel", sLev)
- 		print("stat: {:.3f}".format(stat))
- 		if cv is None:
- 			msg = "critical value not found for the given significance level"
- 		else:
- 			if stat < cv:
- 				msg = "probably same distribution at the {:.3f} significance level".format(sigLev)
- 			else:
- 				msg = "probably not same distribution at the {:.3f} significance level".format(sigLev)
- 		print(msg)
- 		return result
- 
- 	def testTwoSampleScaleAb(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Ansari-Bradley 2 sample scale statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Ansari-Bradley 2 sample scale test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.ansari(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same scale", "probably not same scale", sigLev)
- 		return result
- 
- 	def testTwoSampleScaleMood(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Mood 2 sample scale statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Mood 2 sample scale test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.mood(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same scale", "probably not same scale", sigLev)
- 		return result
- 
- 	def testTwoSampleVarBartlet(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Bartlett 2 sample variance statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Bartlett 2 sample variance test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.bartlett(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same variance", "probably not same variance", sigLev)
- 		return result
- 
- 	def testTwoSampleVarLevene(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Levene 2 sample variance statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Levene 2 sample variance test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.levene(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same variance", "probably not same variance", sigLev)
- 		return result
- 
- 	def testTwoSampleVarFk(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Fligner-Killeen 2 sample variance statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Fligner-Killeen 2 sample variance test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue = sta.fligner(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue)
- 		self.__printStat(stat, pvalue, "probably same variance", "probably not same variance", sigLev)
- 		return result
- 
- 	def testTwoSampleMedMood(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Mood 2 sample median statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Mood 2 sample median test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat, pvalue, median, ctable = sta.median_test(data1, data2)
- 		result = self.__printResult("stat", stat, "pvalue", pvalue, "median", median, "contingencyTable", ctable)
- 		self.__printStat(stat, pvalue, "probably same median", "probably not same median", sigLev)
- 		return result
- 
- 	def testTwoSampleZc(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Zhang-C 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Zhang-C 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		l1 = len(data1)
- 		l2 = len(data2)
- 		l = l1 + l2
- 
- 		#find ranks of each sample within the pooled sample
- 		pooled = np.concatenate([data1, data2])
- 		ranks = findRanks(data1, pooled)
- 		ranks.extend(findRanks(data2, pooled))
- 
- 		s1 = 0.0
- 		for i in range(1, l1 + 1):
- 			s1 += math.log(l1 / (i - 0.5) - 1.0) * math.log(l / (ranks[i-1] - 0.5) - 1.0)
- 
- 		s2 = 0.0
- 		for i in range(1, l2 + 1):
- 			s2 += math.log(l2 / (i - 0.5) - 1.0) * math.log(l / (ranks[l1 + i - 1] - 0.5) - 1.0)
- 		stat = (s1 + s2) / l
- 		print(formatFloat(3, stat, "stat:"))
- 		return stat
- 
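The Zhang-C statistic above leans on the package's findRanks helper; as a self-contained cross-check, here is a hedged restatement using only numpy and scipy (the sorting of per-sample ranks and the sample data are assumptions):

import numpy as np
from scipy.stats import rankdata

def zhangC(x, y):
    pooled = np.concatenate([x, y])
    r = rankdata(pooled)                          # pooled ranks 1..N
    n, m, N = len(x), len(y), len(pooled)
    rx, ry = np.sort(r[:n]), np.sort(r[n:])       # each sample's ranks, ascending
    i, j = np.arange(1, n + 1), np.arange(1, m + 1)
    s = np.sum(np.log(n / (i - 0.5) - 1.0) * np.log(N / (rx - 0.5) - 1.0))
    s += np.sum(np.log(m / (j - 0.5) - 1.0) * np.log(N / (ry - 0.5) - 1.0))
    return s / N

print(zhangC(np.random.normal(0, 1, 100), np.random.normal(0, 1, 100)))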
- 	def testTwoSampleZa(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Zhang-A 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Zhang-A 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		l1 = len(data1)
- 		l2 = len(data2)
- 		l = l1 + l2
- 		pooled = np.concatenate([data1, data2])
- 		cd1 = CumDistr(data1)
- 		cd2 = CumDistr(data2)
- 		s = 0.0
- 		for i in range(1, l + 1):
- 			v = pooled[i-1]
- 			f1 = cd1.getDistr(v)
- 			f2 = cd2.getDistr(v)
- 
- 			t1 = 0 if f1 == 0 else f1 * math.log(f1)
- 			t2 = 0 if f1 == 1.0 else (1.0 - f1) * math.log(1.0 - f1)
- 			s += l1 * (t1 + t2) / ((i - 0.5) * (l - i + 0.5))
- 			t1 = 0 if f2 == 0 else f2 * math.log(f2)
- 			t2 = 0 if f2 == 1.0 else (1.0 - f2) * math.log(1.0 - f2)
- 			s += l2 * (t1 + t2) / ((i - 0.5) * (l - i + 0.5))
- 		stat = -s
- 		print(formatFloat(3, stat, "stat:"))
- 		return stat
- 
- 	def testTwoSampleZk(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Zhang-K 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing Zhang-K 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		l1 = len(data1)
- 		l2 = len(data2)
- 		l = l1 + l2
- 		pooled = np.concatenate([data1, data2])
- 		cd1 = CumDistr(data1)
- 		cd2 = CumDistr(data2)
- 		cd = CumDistr(pooled)
- 
- 		maxStat = None
- 		for i in range(1, l + 1):
- 			v = pooled[i-1]
- 			f1 = cd1.getDistr(v)
- 			f2 = cd2.getDistr(v)
- 			f = cd.getDistr(v)
- 
- 			t1 = 0 if f1 == 0 else f1 * math.log(f1 / f)
- 			t2 = 0 if f1 == 1.0 else (1.0 - f1) * math.log((1.0 - f1) / (1.0 - f))
- 			stat = l1 * (t1 + t2)
- 			t1 = 0 if f2 == 0 else f2 * math.log(f2 / f)
- 			t2 = 0 if f2 == 1.0 else (1.0 - f2) * math.log((1.0 - f2) / (1.0 - f))
- 			stat += l2 * (t1 + t2)
- 			if maxStat is None or stat > maxStat:
- 				maxStat = stat
- 		print(formatFloat(3, maxStat, "stat:"))
- 		return maxStat
- 
- 	def testTwoSampleCvm(self, ds1, ds2, sigLev=.05):
- 		"""
- 		Cramer-von Mises 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		sigLev: statistical significance level
- 		"""
- 		self.__printBanner("doing 2 sample CVM test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		data = np.concatenate((data1, data2))
- 		rdata = sta.rankdata(data)
- 		n = len(data1)
- 		m = len(data2)
- 		l = n + m
- 
- 		#each sample's pooled ranks, sorted ascending as the formula requires
- 		srdata1 = np.sort(rdata[:n])
- 		srdata2 = np.sort(rdata[n:])
- 
- 		s1 = 0
- 		for i in range(n):
- 			t = srdata1[i] - (i + 1)
- 			s1 += (t * t)
- 		s1 *= n
- 
- 		s2 = 0
- 		for i in range(m):
- 			t = srdata2[i] - (i + 1)
- 			s2 += (t * t)
- 		s2 *= m
- 
- 		u = s1 + s2
- 		stat = u / (n * m * l) - (4 * m * n - 1) / (6 * l)
- 		result = self.__printResult("stat", stat)
- 		return result
- 
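For environments with scipy 1.7 or later, scipy ships its own two sample Cramer-von Mises test, which can be used to sanity check the hand rolled statistic above (a sketch; the data is an assumption):

import numpy as np
from scipy import stats as sta

a = np.random.normal(0, 1, 150)
b = np.random.normal(0, 1, 150)
res = sta.cramervonmises_2samp(a, b)
print(res.statistic, res.pvalue)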
- 	def ensureSameSize(self, dlist):
- 		"""
- 		ensures all data sets are of same size
- 
- 		Parameters
- 		dlist : data source list
- 		"""
- 		le = None
- 		for d in dlist:
- 			cle = len(d)
- 			if le is None:
- 				le = cle
- 			else:
- 				assert cle == le, "all data sets need to be of same size"
- 
- 	def testTwoSampleWasserstein(self, ds1, ds2):
- 		"""
- 		Wasserstein 2 sample statistic
- 
- 		Parameters
- 		ds1: data set name or list or numpy array
- 		ds2: data set name or list or numpy array
- 		"""
- 		self.__printBanner("doing Wasserstein distance 2 sample test", ds1, ds2)
- 		data1 = self.getNumericData(ds1)
- 		data2 = self.getNumericData(ds2)
- 		stat = sta.wasserstein_distance(data1, data2)
- 		sd = np.std(np.concatenate([data1, data2]))
- 		nstat = stat / sd
- 		result = self.__printResult("stat", stat, "normalizedStat", nstat)
- 		return result
- 
- 	def getMaxRelMinRedFeatures(self, fdst, tdst, nfeatures, nbins=20):
- 		"""
- 		get top n features based on the max relevance min redundancy (MRMR) algorithm
- 
- 		Parameters
- 		fdst: list of pairs of data set name or list or numpy array and data type
- 		tdst: target data set name or list or numpy array and data type (cat for classification num for regression)
- 		nfeatures : desired no of features
- 		nbins : no of bins for numerical data
- 		"""
- 		self.__printBanner("doing max relevance min redundancy feature selection")
- 		return self.getMutInfoFeatures(fdst, tdst, nfeatures, "mrmr", nbins)
- 
- 	def getJointMutInfoFeatures(self, fdst, tdst, nfeatures, nbins=20):
- 		"""
- 		get top n features based on the joint mutual information (JMI) algorithm
- 
- 		Parameters
- 		fdst: list of pairs of data set name or list or numpy array and data type
- 		tdst: target data set name or list or numpy array and data type (cat for classification num for regression)
- 		nfeatures : desired no of features
- 		nbins : no of bins for numerical data
- 		"""
- 		self.__printBanner("doing joint mutual info feature selection")
- 		return self.getMutInfoFeatures(fdst, tdst, nfeatures, "jmi", nbins)
- 
- 	def getCondMutInfoMaxFeatures(self, fdst, tdst, nfeatures, nbins=20):
- 		"""
- 		get top n features based on the conditional mutual information maximization (CMIM) algorithm
- 
- 		Parameters
- 		fdst: list of pairs of data set name or list or numpy array and data type
- 		tdst: target data set name or list or numpy array and data type (cat for classification num for regression)
- 		nfeatures : desired no of features
- 		nbins : no of bins for numerical data
- 		"""
- 		self.__printBanner("doing conditional mutual info max feature selection")
- 		return self.getMutInfoFeatures(fdst, tdst, nfeatures, "cmim", nbins)
- 
- 	def getInteractCapFeatures(self, fdst, tdst, nfeatures, nbins=20):
- 		"""
- 		get top n features based on the interaction capping (ICAP) algorithm
- 
- 		Parameters
- 		fdst: list of pairs of data set name or list or numpy array and data type
- 		tdst: target data set name or list or numpy array and data type (cat for classification num for regression)
- 		nfeatures : desired no of features
- 		nbins : no of bins for numerical data
- 		"""
- 		self.__printBanner("doing interaction capped feature selection")
- 		return self.getMutInfoFeatures(fdst, tdst, nfeatures, "icap", nbins)
- 
- 	def getMutInfoFeatures(self, fdst, tdst, nfeatures, algo, nbins=20):
- 		"""
- 		get top n features based on various mutual information based algorithms
- 		ref: Conditional likelihood maximisation : A unifying framework for information
- 		theoretic feature selection, Gavin Brown
- 
- 		Parameters
- 		fdst: list of pairs of data set name or list or numpy array and data type
- 		tdst: target data set name or list or numpy array and data type (cat for classification num for regression)
- 		nfeatures : desired no of features
- 		algo: mi based feature selection algorithm
- 		nbins : no of bins for numerical data
- 		"""
- 		#verify data source types
- 		le = len(fdst)
- 		nfeatGiven = int(le / 2)
- 		assertGreater(nfeatGiven, nfeatures, "no of available features should be greater than no of features to be selected")
- 		fds = list()
- 		types = ["num", "cat"]
- 		for i in range(0, le, 2):
- 			ds = fdst[i]
- 			dt = fdst[i+1]
- 			assertInList(dt, types, "invalid type for data source " + dt)
- 			data = self.getNumericData(ds) if dt == "num" else self.getCatData(ds)
- 			p = (ds, dt)
- 			fds.append(p)
- 		algos = ["mrmr", "jmi", "cmim", "icap"]
- 		assertInList(algo, algos, "invalid feature selection algo " + algo)
- 
- 		assertInList(tdst[1], types, "invalid type for data source " + tdst[1])
- 		data = self.getNumericData(tdst[0]) if tdst[1] == "num" else self.getCatData(tdst[0])
- 
- 		sfds = list()
- 		selected = set()
- 		relevancies = dict()
- 		for i in range(nfeatures):
- 			scorem = None
- 			dsm = None
- 			dsmt = None
- 			for ds, dt in fds:
- 				if ds not in selected:
- 					#relevancy of candidate feature wrt target
- 					if ds in relevancies:
- 						mutInfo = relevancies[ds]
- 					else:
- 						mutInfo = self.getMutualInfo([ds, dt, tdst[0], tdst[1]], nbins)["mutInfo"]
- 						relevancies[ds] = mutInfo
- 					relev = mutInfo
- 
- 					#redundancy wrt features selected so far
- 					reds = list()
- 					for sds, sdt, _ in sfds:
- 						mutInfo = self.getMutualInfo([ds, dt, sds, sdt], nbins)["mutInfo"]
- 						mutInfoCnd = self.getCondMutualInfo([ds, dt, sds, sdt, tdst[0], tdst[1]], nbins)["condMutInfo"] \
- 						if algo != "mrmr" else 0
- 						red = mutInfo - mutInfoCnd
- 						reds.append(red)
- 
- 					if algo == "mrmr" or algo == "jmi":
- 						redun = sum(reds) / len(sfds) if len(sfds) > 0 else 0
- 					elif algo == "cmim" or algo == "icap":
- 						redun = max(reds) if len(sfds) > 0 else 0
- 						if algo == "icap":
- 							redun = max(0, redun)
- 					score = relev - redun
- 					if scorem is None or score > scorem:
- 						scorem = score
- 						dsm = ds
- 						dsmt = dt
- 
- 			pa = (dsm, dsmt, scorem)
- 			sfds.append(pa)
- 			selected.add(dsm)
- 
- 		selFeatures = list(map(lambda r: (r[0], r[2]), sfds))
- 		result = self.__printResult("selFeatures", selFeatures)
- 		return result
- 
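The greedy loop above depends on the package's getMutualInfo and getCondMutualInfo helpers; for readers who want the MRMR variant in isolation, here is a minimal sketch built on scikit-learn estimators (the function and variable names are hypothetical):

import numpy as np
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

def mrmrSelect(X, y, nfeatures):
    relevance = mutual_info_classif(X, y, random_state=0)
    selected, remaining = [], list(range(X.shape[1]))
    while len(selected) < nfeatures:
        scores = {}
        for j in remaining:
            # mean pairwise MI with already selected features as redundancy
            red = np.mean([mutual_info_regression(X[:, [j]], X[:, s], random_state=0)[0]
                for s in selected]) if selected else 0.0
            scores[j] = relevance[j] - red
        best = max(scores, key=scores.get)
        selected.append(best)
        remaining.remove(best)
    return selected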
- 	def getFastCorrFeatures(self, fdst, tdst, delta, nbins=20):
- 		"""
- 		get top features based on the Fast Correlation Based Filter (FCBF)
- 		ref: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution,
- 		Lei Yu
- 
- 		Parameters
- 		fdst: list of pairs of data set name or list or numpy array and data type
- 		tdst: target data set name or list or numpy array and data type (cat for classification num for regression)
- 		delta : feature, target correlation threshold
- 		nbins : no of bins for numerical data
- 		"""
- 		le = len(fdst)
- 		nfeatGiven = int(le / 2)
- 		fds = list()
- 		types = ["num", "cat"]
- 		for i in range(0, le, 2):
- 			ds = fdst[i]
- 			dt = fdst[i+1]
- 			assertInList(dt, types, "invalid type for data source " + dt)
- 			data = self.getNumericData(ds) if dt == "num" else self.getCatData(ds)
- 			p = (ds, dt)
- 			fds.append(p)
- 
- 		assertInList(tdst[1], types, "invalid type for data source " + tdst[1])
- 		data = self.getNumericData(tdst[0]) if tdst[1] == "num" else self.getCatData(tdst[0])
- 
- 		#get features with symmetric uncertainty above threshold
- 		tentr = self.getAnyEntropy(tdst[0], tdst[1], nbins)["entropy"]
- 		rfeatures = list()
- 		fentrs = dict()
- 		for ds, dt in fds:
- 			mutInfo = self.getMutualInfo([ds, dt, tdst[0], tdst[1]], nbins)["mutInfo"]
- 			fentr = self.getAnyEntropy(ds, dt, nbins)["entropy"]
- 			sunc = 2 * mutInfo / (tentr + fentr)
- 			if sunc >= delta:
- 				f = [ds, dt, sunc, False]
- 				rfeatures.append(f)
- 				fentrs[ds] = fentr
- 
- 		#sort descending by symmetric uncertainty
- 		rfeatures.sort(key=lambda e: e[2], reverse=True)
- 
- 		#discard redundant features
- 		le = len(rfeatures)
- 		for i in range(le):
- 			if rfeatures[i][3]:
- 				continue
- 			for j in range(i+1, le, 1):
- 				if rfeatures[j][3]:
- 					continue
- 				mutInfo = self.getMutualInfo([rfeatures[i][0], rfeatures[i][1], rfeatures[j][0], rfeatures[j][1]], nbins)["mutInfo"]
- 				sunc = 2 * mutInfo / (fentrs[rfeatures[i][0]] + fentrs[rfeatures[j][0]])
- 				if sunc >= rfeatures[j][2]:
- 					rfeatures[j][3] = True
- 
- 		frfeatures = list(filter(lambda f: not f[3], rfeatures))
- 		selFeatures = list(map(lambda f: [f[0], f[2]], frfeatures))
- 		result = self.__printResult("selFeatures", selFeatures)
- 		return result
- 
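FCBF hinges on symmetric uncertainty, SU(X, Y) = 2 * I(X; Y) / (H(X) + H(Y)); a tiny self-contained sketch of that quantity for two discrete variables (the helper name is hypothetical, and the estimate is histogram based):

import numpy as np
from scipy.stats import entropy

def symmetricUncertainty(x, y):
    xc = np.unique(x, return_counts=True)[1]
    yc = np.unique(y, return_counts=True)[1]
    hx, hy = entropy(xc), entropy(yc)
    jointCounts = np.unique(np.stack([x, y], axis=1), axis=0, return_counts=True)[1]
    hxy = entropy(jointCounts)
    # I(X;Y) = H(X) + H(Y) - H(X,Y)
    return 2.0 * (hx + hy - hxy) / (hx + hy)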
- 	def getInfoGainFeatures(self, fdst, tdst, nfeatures, nsplit, nbins=20):
- 		"""
- 		get top n features based on information gain or entropy loss
- 
- 		Parameters
- 		fdst: list of pairs of data set name or list or numpy array and data type
- 		tdst: target data set name or list or numpy array and data type (cat for classification num for regression)
- 		nfeatures : desired no of features
- 		nsplit : no of splits
- 		nbins : no of bins for numerical data
- 		"""
- 		le = len(fdst)
- 		nfeatGiven = int(le / 2)
- 		assertGreater(nfeatGiven, nfeatures, "available features should be greater than desired")
- 		fds = list()
- 		types = ["num", "cat"]
- 		for i in range(0, le, 2):
- 			ds = fdst[i]
- 			dt = fdst[i+1]
- 			assertInList(dt, types, "invalid type for data source " + dt)
- 			data = self.getNumericData(ds) if dt == "num" else self.getCatData(ds)
- 			p = (ds, dt)
- 			fds.append(p)
- 
- 		assertInList(tdst[1], types, "invalid type for data source " + tdst[1])
- 		assertGreater(nsplit, 3, "minimum 4 splits necessary")
- 		tdata = self.getNumericData(tdst[0]) if tdst[1] == "num" else self.getCatData(tdst[0])
- 		tentr = self.getAnyEntropy(tdst[0], tdst[1], nbins)["entropy"]
- 		sz = len(tdata)
- 
- 		sfds = list()
- 		for ds, dt in fds:
- 			if dt == "num":
- 				fd = self.getNumericData(ds)
- 				_, _, vmax, vmin = self.__getBasicStats(fd)
- 				intv = (vmax - vmin) / nsplit
- 				maxig = None
- 				spmin = vmin + intv
- 				spmax = vmax - 0.9 * intv
- 
- 				#iterate all splits
- 				for sp in np.arange(spmin, spmax, intv):
- 					ltvals = list()
- 					gevals = list()
- 					for i in range(len(fd)):
- 						if fd[i] < sp:
- 							ltvals.append(tdata[i])
- 						else:
- 							gevals.append(tdata[i])
- 
- 					self.addListNumericData(ltvals, "spds") if tdst[1] == "num" else self.addListCatData(ltvals, "spds")
- 					lten = self.getAnyEntropy("spds", tdst[1], nbins)["entropy"]
- 					self.addListNumericData(gevals, "spds") if tdst[1] == "num" else self.addListCatData(gevals, "spds")
- 					geen = self.getAnyEntropy("spds", tdst[1], nbins)["entropy"]
- 
- 					#info gain
- 					ig = tentr - (len(ltvals) * lten / sz + len(gevals) * geen / sz)
- 					if maxig is None or ig > maxig:
- 						maxig = ig
- 
- 				pa = (ds, maxig)
- 				sfds.append(pa)
- 			else:
- 				fd = self.getCatData(ds)
- 				fvals = set(fd)
- 				fdps = genPowerSet(fvals)
- 				maxig = None
- 
- 				#iterate all subsets
- 				for s in fdps:
- 					if len(s) == len(fvals):
- 						continue
- 					invals = list()
- 					exvals = list()
- 					for i in range(len(fd)):
- 						if fd[i] in s:
- 							invals.append(tdata[i])
- 						else:
- 							exvals.append(tdata[i])
- 
- 					self.addListNumericData(invals, "spds") if tdst[1] == "num" else self.addListCatData(invals, "spds")
- 					inen = self.getAnyEntropy("spds", tdst[1], nbins)["entropy"]
- 					self.addListNumericData(exvals, "spds") if tdst[1] == "num" else self.addListCatData(exvals, "spds")
- 					exen = self.getAnyEntropy("spds", tdst[1], nbins)["entropy"]
- 
- 					ig = tentr - (len(invals) * inen / sz + len(exvals) * exen / sz)
- 					if maxig is None or ig > maxig:
- 						maxig = ig
- 
- 				pa = (ds, maxig)
- 				sfds.append(pa)
- 
- 		#sort by info gain
- 		sfds.sort(key=lambda v: v[1], reverse=True)
- 
- 		result = self.__printResult("selFeatures", sfds[:nfeatures])
- 		return result
- 
- 	def __stackData(self, *dsl):
- 		"""
- 		stacks columns to create a matrix
- 
- 		Parameters
- 		dsl: data source list
- 		"""
- 		dlist = tuple(map(lambda ds: self.getNumericData(ds), dsl))
- 		self.ensureSameSize(dlist)
- 		dmat = np.column_stack(dlist)
- 		return dmat
- 
- 	def __printBanner(self, msg, *dsl):
- 		"""
- 		print banner for any function
- 
- 		Parameters
- 		msg: message
- 		dsl: list of data set name or list or numpy array
- 		"""
- 		tags = list(map(lambda ds: ds if type(ds) == str else "anonymous", dsl))
- 		forData = " for data sets " if tags else ""
- 		msg = msg + forData + " ".join(tags)
- 		if self.verbose:
- 			print("\n== " + msg + " ==")
- 
- 	def __printDone(self):
- 		"""
- 		print done message
- 		"""
- 		if self.verbose:
- 			print("done")
- 
- 	def __printStat(self, stat, pvalue, nhMsg, ahMsg, sigLev=.05):
- 		"""
- 		generic stat and pvalue output
- 
- 		Parameters
- 		stat : stat value
- 		pvalue : p value
- 		nhMsg : message for failure to reject the null hypothesis
- 		ahMsg : message for rejection of the null hypothesis
- 		sigLev : significance level
- 		"""
- 		if self.verbose:
- 			print("\ntest result:")
- 			print("stat: {:.3f}".format(stat))
- 			print("pvalue: {:.3f}".format(pvalue))
- 			print("significance level: {:.3f}".format(sigLev))
- 			print(nhMsg if pvalue > sigLev else ahMsg)
- 
- 	def __printResult(self, *values):
- 		"""
- 		print results
- 
- 		Parameters
- 		values : flattened key and value pairs
- 		"""
- 		result = dict()
- 		assert len(values) % 2 == 0, "key value list should have even number of items"
- 		for i in range(0, len(values), 2):
- 			result[values[i]] = values[i+1]
- 		if self.verbose:
- 			print("result details:")
- 			self.pp.pprint(result)
- 		return result
- 
- 	def __getBasicStats(self, data):
- 		"""
- 		get mean, std dev, max and min
- 
- 		Parameters
- 		data : numpy array
- 		"""
- 		mean = np.average(data)
- 		sd = np.std(data)
- 		r = (mean, sd, np.max(data), np.min(data))
- 		return r
matumizi/matumizi/mcsim.py DELETED
@@ -1,552 +0,0 @@
- #!/usr/local/bin/python3
- 
- # avenir-python: Machine Learning
- # Author: Pranab Ghosh
- #
- # Licensed under the Apache License, Version 2.0 (the "License"); you
- # may not use this file except in compliance with the License. You may
- # obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- # implied. See the License for the specific language governing
- # permissions and limitations under the License.
- 
- # Package imports
- import os
- import sys
- import matplotlib.pyplot as plt
- import numpy as np
- import matplotlib
- import random
- import jprops
- import statistics
- from matplotlib import pyplot
- from .util import *
- from .mlutil import *
- from .sampler import *
- 
- class MonteCarloSimulator(object):
- 	"""
- 	Monte Carlo simulator for integration and various statistics of complex functions
- 	"""
- 	def __init__(self, numIter, callback, logFilePath, logLevName):
- 		"""
- 		constructor
- 
- 		Parameters
- 		numIter : num of iterations
- 		callback : call back method
- 		logFilePath : log file path
- 		logLevName : log level
- 		"""
- 		self.samplers = list()
- 		self.numIter = numIter
- 		self.callback = callback
- 		self.extraArgs = None
- 		self.output = list()
- 		self.sum = None
- 		self.mean = None
- 		self.sd = None
- 		self.replSamplers = dict()
- 		self.prSamples = None
- 
- 		self.logger = None
- 		if logFilePath is not None:
- 			self.logger = createLogger(__name__, logFilePath, logLevName)
- 			self.logger.info("******** starting new session of MonteCarloSimulator")
- 
- 	def registerBernoulliTrialSampler(self, pr):
- 		"""
- 		Bernoulli trial sampler
- 
- 		Parameters
- 		pr : probability
- 		"""
- 		self.samplers.append(BernoulliTrialSampler(pr))
- 
- 	def registerPoissonSampler(self, rateOccur, maxSamp):
- 		"""
- 		Poisson sampler
- 
- 		Parameters
- 		rateOccur : rate of occurence
- 		maxSamp : max limit on no of samples
- 		"""
- 		self.samplers.append(PoissonSampler(rateOccur, maxSamp))
- 
- 	def registerUniformSampler(self, minv, maxv):
- 		"""
- 		uniform sampler
- 
- 		Parameters
- 		minv : min value
- 		maxv : max value
- 		"""
- 		self.samplers.append(UniformNumericSampler(minv, maxv))
- 
- 	def registerTriangularSampler(self, min, max, vertexValue, vertexPos=None):
- 		"""
- 		triangular sampler
- 
- 		Parameters
- 		min : min value
- 		max : max value
- 		vertexValue : distr value at vertex
- 		vertexPos : vertex position
- 		"""
- 		self.samplers.append(TriangularRejectSampler(min, max, vertexValue, vertexPos))
- 
- 	def registerGaussianSampler(self, mean, sd):
- 		"""
- 		gaussian sampler
- 
- 		Parameters
- 		mean : mean
- 		sd : std deviation
- 		"""
- 		self.samplers.append(GaussianRejectSampler(mean, sd))
- 
- 	def registerNormalSampler(self, mean, sd):
- 		"""
- 		gaussian sampler using numpy
- 
- 		Parameters
- 		mean : mean
- 		sd : std deviation
- 		"""
- 		self.samplers.append(NormalSampler(mean, sd))
- 
- 	def registerLogNormalSampler(self, mean, sd):
- 		"""
- 		log normal sampler using numpy
- 
- 		Parameters
- 		mean : mean
- 		sd : std deviation
- 		"""
- 		self.samplers.append(LogNormalSampler(mean, sd))
- 
- 	def registerParetoSampler(self, mode, shape):
- 		"""
- 		pareto sampler using numpy
- 
- 		Parameters
- 		mode : mode
- 		shape : shape
- 		"""
- 		self.samplers.append(ParetoSampler(mode, shape))
- 
- 	def registerGammaSampler(self, shape, scale):
- 		"""
- 		gamma sampler using numpy
- 
- 		Parameters
- 		shape : shape
- 		scale : scale
- 		"""
- 		self.samplers.append(GammaSampler(shape, scale))
- 
- 	def registerDiscreteRejectSampler(self, xmin, xmax, step, *values):
- 		"""
- 		discrete int sampler
- 
- 		Parameters
- 		xmin : min value
- 		xmax : max value
- 		step : discrete step
- 		values : distr values
- 		"""
- 		self.samplers.append(DiscreteRejectSampler(xmin, xmax, step, *values))
- 
- 	def registerNonParametricSampler(self, minv, binWidth, *values):
- 		"""
- 		nonparametric sampler
- 
- 		Parameters
- 		minv : min value
- 		binWidth : bin width
- 		values : distr values
- 		"""
- 		sampler = NonParamRejectSampler(minv, binWidth, *values)
- 		sampler.sampleAsFloat()
- 		self.samplers.append(sampler)
- 
- 	def registerMultiVarNormalSampler(self, numVar, *values):
- 		"""
- 		multi var gaussian sampler using numpy
- 
- 		Parameters
- 		numVar : no of variables
- 		values : numVar mean values followed by numVar x numVar values for covar matrix
- 		"""
- 		self.samplers.append(MultiVarNormalSampler(numVar, *values))
- 
- 	def registerJointNonParamRejectSampler(self, xmin, xbinWidth, xnbin, ymin, ybinWidth, ynbin, *values):
- 		"""
- 		joint nonparametric sampler
- 
- 		Parameters
- 		xmin : min value for x
- 		xbinWidth : bin width for x
- 		xnbin : no of bins for x
- 		ymin : min value for y
- 		ybinWidth : bin width for y
- 		ynbin : no of bins for y
- 		values : distr values
- 		"""
- 		self.samplers.append(JointNonParamRejectSampler(xmin, xbinWidth, xnbin, ymin, ybinWidth, ynbin, *values))
- 
- 	def registerRangePermutationSampler(self, minv, maxv, *numShuffles):
- 		"""
- 		permutation sampler with range
- 
- 		Parameters
- 		minv : min of range
- 		maxv : max of range
- 		numShuffles : no of shuffles or range of no of shuffles
- 		"""
- 		self.samplers.append(PermutationSampler.createSamplerWithRange(minv, maxv, *numShuffles))
- 
- 	def registerValuesPermutationSampler(self, values, *numShuffles):
- 		"""
- 		permutation sampler with values
- 
- 		Parameters
- 		values : list data
- 		numShuffles : no of shuffles or range of no of shuffles
- 		"""
- 		self.samplers.append(PermutationSampler.createSamplerWithValues(values, *numShuffles))
- 
- 	def registerNormalSamplerWithTrendCycle(self, mean, stdDev, trend, cycle, step=1):
- 		"""
- 		normal sampler with trend and cycle
- 
- 		Parameters
- 		mean : mean
- 		stdDev : std deviation
- 		trend : trend delta
- 		cycle : cycle values wrt base mean
- 		step : adjustment step for cycle and trend
- 		"""
- 		self.samplers.append(NormalSamplerWithTrendCycle(mean, stdDev, trend, cycle, step))
- 
- 	def registerCustomSampler(self, sampler):
- 		"""
- 		custom sampler
- 
- 		Parameters
- 		sampler : sampler with sample() method
- 		"""
- 		self.samplers.append(sampler)
- 
- 	def registerEventSampler(self, intvSampler, valSampler=None):
- 		"""
- 		event sampler
- 
- 		Parameters
- 		intvSampler : interval sampler
- 		valSampler : value sampler
- 		"""
- 		self.samplers.append(EventSampler(intvSampler, valSampler))
- 
- 	def registerMetropolitanSampler(self, propStdDev, minv, binWidth, values):
- 		"""
- 		metropolitan sampler
- 
- 		Parameters
- 		propStdDev : proposal distr std dev
- 		minv : min domain value for target distr
- 		binWidth : bin width
- 		values : target distr values
- 		"""
- 		self.samplers.append(MetropolitanSampler(propStdDev, minv, binWidth, values))
- 
- 	def setSampler(self, var, iter, sampler):
- 		"""
- 		set sampler for some variable when iteration reaches certain point
- 
- 		Parameters
- 		var : sampler index
- 		iter : iteration count
- 		sampler : new sampler
- 		"""
- 		key = (var, iter)
- 		self.replSamplers[key] = sampler
- 
- 	def registerExtraArgs(self, *args):
- 		"""
- 		extra args
- 
- 		Parameters
- 		args : extra argument list
- 		"""
- 		self.extraArgs = args
- 
- 	def replSampler(self, iter):
- 		"""
- 		replace sampler for this iteration
- 
- 		Parameters
- 		iter : iteration number
- 		"""
- 		if len(self.replSamplers) > 0:
- 			for v in range(self.numVars):
- 				key = (v, iter)
- 				if key in self.replSamplers:
- 					sampler = self.replSamplers[key]
- 					self.samplers[v] = sampler
- 
- 	def run(self):
- 		"""
- 		run simulator
- 		"""
- 		self.sum = None
- 		self.mean = None
- 		self.sd = None
- 		self.numVars = len(self.samplers)
- 		vOut = 0
- 
- 		for i in range(self.numIter):
- 			self.replSampler(i)
- 			args = list()
- 			for s in self.samplers:
- 				arg = s.sample()
- 				if type(arg) is list:
- 					args.extend(arg)
- 				else:
- 					args.append(arg)
- 
- 			slen = len(args)
- 			if self.extraArgs:
- 				args.extend(self.extraArgs)
- 			args.append(self)
- 			args.append(i)
- 			vOut = self.callback(args)
- 			self.output.append(vOut)
- 			self.prSamples = args[:slen]
- 
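To make the control flow of run() concrete, here is a hedged usage sketch estimating pi from the hit rate of a quarter circle (the import path is inferred from the file location, and the callback signature follows the args list built above: sampled values first, then the simulator and the iteration index):

from matumizi.mcsim import MonteCarloSimulator

def inQuarterCircle(args):
    x, y = args[0], args[1]
    return 1.0 if x * x + y * y <= 1.0 else 0.0

sim = MonteCarloSimulator(100000, inQuarterCircle, None, None)
sim.registerUniformSampler(0.0, 1.0)
sim.registerUniformSampler(0.0, 1.0)
sim.run()
print(4.0 * sim.getMean())    # roughly pi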
- 	def getOutput(self):
- 		"""
- 		get raw output
- 		"""
- 		return self.output
- 
- 	def setOutput(self, values):
- 		"""
- 		set raw output
- 
- 		Parameters
- 		values : output values
- 		"""
- 		self.output = values
- 		self.numIter = len(values)
- 
- 	def drawHist(self, myTitle, myXlabel, myYlabel):
- 		"""
- 		draw histogram
- 
- 		Parameters
- 		myTitle : title
- 		myXlabel : label for x
- 		myYlabel : label for y
- 		"""
- 		pyplot.hist(self.output, density=True)
- 		pyplot.title(myTitle)
- 		pyplot.xlabel(myXlabel)
- 		pyplot.ylabel(myYlabel)
- 		pyplot.show()
- 
- 	def getSum(self):
- 		"""
- 		get sum
- 		"""
- 		if not self.sum:
- 			self.sum = sum(self.output)
- 		return self.sum
- 
- 	def getMean(self):
- 		"""
- 		get average
- 		"""
- 		if self.mean is None:
- 			self.mean = statistics.mean(self.output)
- 		return self.mean
- 
- 	def getStdDev(self):
- 		"""
- 		get std dev
- 		"""
- 		if self.sd is None:
- 			self.sd = statistics.stdev(self.output, xbar=self.mean) if self.mean else statistics.stdev(self.output)
- 		return self.sd
- 
- 	def getMedian(self):
- 		"""
- 		get median
- 		"""
- 		med = statistics.median(self.output)
- 		return med
- 
- 	def getMax(self):
- 		"""
- 		get max
- 		"""
- 		return max(self.output)
- 
- 	def getMin(self):
- 		"""
- 		get min
- 		"""
- 		return min(self.output)
- 
- 	def getIntegral(self, bounds):
- 		"""
- 		integral, i.e. mean output scaled by the size of the sampled domain
- 
- 		Parameters
- 		bounds : size of the integration domain
- 		"""
- 		if not self.sum:
- 			self.sum = sum(self.output)
- 		return self.sum * bounds / self.numIter
- 
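A quick check of getIntegral() on a known integral, under the same import path assumption as the earlier sketch:

import math
from matumizi.mcsim import MonteCarloSimulator

sim = MonteCarloSimulator(200000, lambda args: math.sin(args[0]), None, None)
sim.registerUniformSampler(0.0, math.pi)
sim.run()
print(sim.getIntegral(math.pi))    # integral of sin over [0, pi], roughly 2.0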
- 	def getLowerTailStat(self, zvalue, numIntPoints=50):
- 		"""
- 		get lower tail stat
- 
- 		Parameters
- 		zvalue : zscore upper bound
- 		numIntPoints : no of interpolation points for cum distribution
- 		"""
- 		mean = self.getMean()
- 		sd = self.getStdDev()
- 		tailStart = self.getMin()
- 		tailEnd = mean - zvalue * sd
- 		cvaCounts = self.cumDistr(tailStart, tailEnd, numIntPoints)
- 
- 		reqConf = floatRange(0.0, 0.150, .01)
- 		msg = "p value outside interpolation range, reduce zvalue and try again {:.5f} {:.5f}".format(reqConf[-1], cvaCounts[-1][1])
- 		assert reqConf[-1] < cvaCounts[-1][1], msg
- 		critValues = self.interpolateCritValues(reqConf, cvaCounts, True, tailStart, tailEnd)
- 		return critValues
- 
- 	def getPercentile(self, cvalue):
- 		"""
- 		percentile
- 
- 		Parameters
- 		cvalue : value for percentile
- 		"""
- 		count = 0
- 		for v in self.output:
- 			if v < cvalue:
- 				count += 1
- 		percent = int(count * 100.0 / self.numIter)
- 		return percent
- 
- 	def getCritValue(self, pvalue):
- 		"""
- 		critical value for probability threshold
- 
- 		Parameters
- 		pvalue : pvalue
- 		"""
- 		assertWithinRange(pvalue, 0.0, 1.0, "invalid probability value")
- 		svalues = sorted(self.output)
- 		ppval = None
- 		cval = None
- 		for i in range(self.numIter - 1):
- 			cpval = (i + 1) / self.numIter
- 			if cpval > pvalue:
- 				sl = svalues[i] - svalues[i-1]
- 				cval = svalues[i-1] + sl * (pvalue - ppval)
- 				break
- 			ppval = cpval
- 		return cval
- 
- 	def getUpperTailStat(self, zvalue, numIntPoints=50):
- 		"""
- 		upper tail stat
- 
- 		Parameters
- 		zvalue : zscore upper bound
- 		numIntPoints : no of interpolation points for cum distribution
- 		"""
- 		mean = self.getMean()
- 		sd = self.getStdDev()
- 		tailStart = mean + zvalue * sd
- 		tailEnd = self.getMax()
- 		cvaCounts = self.cumDistr(tailStart, tailEnd, numIntPoints)
- 
- 		reqConf = floatRange(0.85, 1.0, .01)
- 		msg = "p value outside interpolation range, reduce zvalue and try again {:.5f} {:.5f}".format(reqConf[0], cvaCounts[0][1])
- 		assert reqConf[0] > cvaCounts[0][1], msg
- 		critValues = self.interpolateCritValues(reqConf, cvaCounts, False, tailStart, tailEnd)
- 		return critValues
- 
- 	def cumDistr(self, tailStart, tailEnd, numIntPoints):
- 		"""
- 		cumulative distribution at tail
- 
- 		Parameters
- 		tailStart : tail start
- 		tailEnd : tail end
- 		numIntPoints : no of interpolation points
- 		"""
- 		delta = (tailEnd - tailStart) / numIntPoints
- 		cvalues = floatRange(tailStart, tailEnd, delta)
- 		cvaCounts = list()
- 		for cv in cvalues:
- 			count = 0
- 			for v in self.output:
- 				if v < cv:
- 					count += 1
- 			p = (cv, count / self.numIter)
- 			if self.logger is not None:
- 				self.logger.info("{:.3f} {:.3f}".format(p[0], p[1]))
- 			cvaCounts.append(p)
- 		return cvaCounts
- 
- 	def interpolateCritValues(self, reqConf, cvaCounts, lowertTail, tailStart, tailEnd):
- 		"""
- 		interpolate for specific confidence limits
- 
- 		Parameters
- 		reqConf : confidence level values
- 		cvaCounts : cum values
- 		lowertTail : True if lower tail
- 		tailStart : tail start
- 		tailEnd : tail end
- 		"""
- 		critValues = list()
- 		if self.logger is not None:
- 			self.logger.info("target conf limit " + str(reqConf))
- 		reqConfSub = reqConf[1:] if lowertTail else reqConf[:-1]
- 		for rc in reqConfSub:
- 			for i in range(len(cvaCounts) - 1):
- 				if rc >= cvaCounts[i][1] and rc < cvaCounts[i+1][1]:
- 					slope = (cvaCounts[i+1][0] - cvaCounts[i][0]) / (cvaCounts[i+1][1] - cvaCounts[i][1])
- 					cval = cvaCounts[i][0] + slope * (rc - cvaCounts[i][1])
- 					p = (rc, cval)
- 					if self.logger is not None:
- 						self.logger.debug("interpolated crit values {:.3f} {:.3f}".format(p[0], p[1]))
- 					critValues.append(p)
- 					break
- 		if lowertTail:
- 			p = (0.0, tailStart)
- 			critValues.insert(0, p)
- 		else:
- 			p = (1.0, tailEnd)
- 			critValues.append(p)
- 		return critValues
matumizi/matumizi/mlutil.py DELETED
@@ -1,1500 +0,0 @@
1
- #!/usr/local/bin/python3
2
-
3
- # avenir-python: Machine Learning
4
- # Author: Pranab Ghosh
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License"); you
7
- # may not use this file except in compliance with the License. You may
8
- # obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
- # implied. See the License for the specific language governing
16
- # permissions and limitations under the License.
17
-
18
- # Package imports
19
- import os
20
- import sys
21
- import numpy as np
22
- from sklearn import preprocessing
23
- from sklearn import metrics
24
- from sklearn.datasets import make_blobs
25
- from sklearn.datasets import make_classification
26
- import random
27
- from math import *
28
- from decimal import Decimal
29
- import statistics
30
- import jprops
31
- from Levenshtein import distance as ld
32
- from .util import *
33
- from .sampler import *
34
-
35
- class Configuration:
36
- """
37
- Configuration management. Supports default value, mandatory value and typed value.
38
- """
39
- def __init__(self, configFile, defValues, verbose=False):
40
- """
41
- initializer
42
-
43
- Parameters
44
- configFile : config file path
45
- defValues : dictionary of default values
46
- verbose : verbosity flag
47
- """
48
- configs = {}
49
- with open(configFile) as fp:
50
- for key, value in jprops.iter_properties(fp):
51
- configs[key] = value
52
- self.configs = configs
53
- self.defValues = defValues
54
- self.verbose = verbose
55
-
56
- def override(self, configFile):
57
- """
58
- over ride configuration from file
59
-
60
- Parameters
61
- configFile : override config file path
62
- """
63
- with open(configFile) as fp:
64
- for key, value in jprops.iter_properties(fp):
65
- self.configs[key] = value
66
-
67
-
68
- def setParam(self, name, value):
69
- """
70
- override individual configuration
71
-
72
- Parameters
73
- name : config param name
74
- value : config param value
75
- """
76
- self.configs[name] = value
77
-
78
-
79
- def getStringConfig(self, name):
80
- """
81
- get string param
82
-
83
- Parameters
84
- name : config param name
85
- """
86
- if self.isNone(name):
87
- val = (None, False)
88
- elif self.isDefault(name):
89
- val = (self.handleDefault(name), True)
90
- else:
91
- val = (self.configs[name], False)
92
- if self.verbose:
93
- print( "{} {} {}".format(name, self.configs[name], val[0]))
94
- return val
95
-
96
-
97
- def getIntConfig(self, name):
98
- """
99
- get int param
100
-
101
- Parameters
102
- name : config param name
103
- """
104
- #print "%s %s" %(name,self.configs[name])
105
- if self.isNone(name):
106
- val = (None, False)
107
- elif self.isDefault(name):
108
- val = (self.handleDefault(name), True)
109
- else:
110
- val = (int(self.configs[name]), False)
111
- if self.verbose:
112
- print( "{} {} {}".format(name, self.configs[name], val[0]))
113
- return val
114
-
115
-
116
- def getFloatConfig(self, name):
117
- """
118
- get float param
119
-
120
- Parameters
121
- name : config param name
122
- """
123
- #print "%s %s" %(name,self.configs[name])
124
- if self.isNone(name):
125
- val = (None, False)
126
- elif self.isDefault(name):
127
- val = (self.handleDefault(name), True)
128
- else:
129
- val = (float(self.configs[name]), False)
130
- if self.verbose:
131
- print( "{} {} {:06.3f}".format(name, self.configs[name], val[0]))
132
- return val
133
-
134
-
135
- def getBooleanConfig(self, name):
136
- """
137
- get boolean param
138
-
139
- Parameters
140
- name : config param name
141
- """
142
- if self.isNone(name):
143
- val = (None, False)
144
- elif self.isDefault(name):
145
- val = (self.handleDefault(name), True)
146
- else:
147
- bVal = self.configs[name].lower() == "true"
148
- val = (bVal, False)
149
- if self.verbose:
150
- print( "{} {} {}".format(name, self.configs[name], val[0]))
151
- return val
152
-
153
-
154
- def getIntListConfig(self, name, delim=","):
155
- """
156
- get int list param
157
-
158
- Parameters
159
- name : config param name
160
- delim : delimiter
161
- """
162
- if self.isNone(name):
163
- val = (None, False)
164
- elif self.isDefault(name):
165
- val = (self.handleDefault(name), True)
166
- else:
167
- delSepStr = self.getStringConfig(name)
168
-
169
- #specified as range
170
- intList = strListOrRangeToIntArray(delSepStr[0])
171
- val =(intList, delSepStr[1])
172
- return val
173
-
174
- def getFloatListConfig(self, name, delim=","):
175
- """
176
- get float list param
177
-
178
- Parameters
179
- name : config param name
180
- delim : delimiter
181
- """
182
- delSepStr = self.getStringConfig(name)
183
- if self.isNone(name):
184
- val = (None, False)
185
- elif self.isDefault(name):
186
- val = (self.handleDefault(name), True)
187
- else:
188
- flList = strToFloatArray(delSepStr[0], delim)
189
- val =(flList, delSepStr[1])
190
- return val
191
-
192
-
193
- def getStringListConfig(self, name, delim=","):
194
- """
195
- get string list param
196
-
197
- Parameters
198
- name : config param name
199
- delim : delimiter
200
- """
201
- delSepStr = self.getStringConfig(name)
202
- if self.isNone(name):
203
- val = (None, False)
204
- elif self.isDefault(name):
205
- val = (self.handleDefault(name), True)
206
- else:
207
- strList = delSepStr[0].split(delim)
208
- val = (strList, delSepStr[1])
209
- return val
210
-
211
- def handleDefault(self, name):
212
- """
213
- handles default
214
-
215
- Parameters
216
- name : config param name
217
- """
218
- dVal = self.defValues[name]
219
- if (dVal[1] is None):
220
- val = dVal[0]
221
- else:
222
- raise ValueError(dVal[1])
223
- return val
224
-
225
-
226
- def isNone(self, name):
227
- """
228
- true if value is None
229
-
230
- Parameters
231
- name : config param name
232
- """
233
- return self.configs[name].lower() == "none"
234
-
235
-
236
- def isDefault(self, name):
237
- """
238
- true if the value is default
239
-
240
- Parameters
241
- name : config param name
242
- """
243
- de = self.configs[name] == "_"
244
- #print de
245
- return de
246
-
247
-
248
- def eitherOrStringConfig(self, firstName, secondName):
249
- """
250
- returns one of two string parameters
251
-
252
- Parameters
253
- firstName : first parameter name
254
- secondName : second parameter name
255
- """
256
- if not self.isNone(firstName):
257
- first = self.getStringConfig(firstName)[0]
258
- second = None
259
- if not self.isNone(secondName):
260
- raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName)
261
- else:
262
- if not self.isNone(secondName):
263
- second = self.getStringConfig(secondName)[0]
264
- first = None
265
- else:
266
- raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName)
267
- return (first, second)
268
-
269
-
270
- def eitherOrIntConfig(self, firstName, secondName):
271
- """
272
- returns one of two int parameters
273
-
274
- Parameters
275
- firstName : first parameter name
276
- secondName : second parameter name
277
- """
278
- if not self.isNone(firstName):
279
- first = self.getIntConfig(firstName)[0]
280
- second = None
281
- if not self.isNone(secondName):
282
- raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName)
283
- else:
284
- if not self.isNone(secondName):
285
- second = self.getIntConfig(secondName)[0]
286
- first = None
287
- else:
288
- raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName)
289
- return (first, second)
290
-
291
-
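
A minimal usage sketch for the Configuration class above (hedged: the import path, file name and property keys below are hypothetical; in the property file "_" means fall back to the default and "none" means absent):

from matumizi.mlutil import Configuration

#app.properties contains: train.num.iter=100 and train.learning.rate=_
defValues = {"train.num.iter" : (50, None), "train.learning.rate" : (0.01, None)}
config = Configuration("app.properties", defValues)
print(config.getIntConfig("train.num.iter")[0])        #100, taken from the file
print(config.getFloatConfig("train.learning.rate")[0]) #0.01, default applied
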
292
- class CatLabelGenerator:
293
- """
294
- label generator for categorical variables
295
- """
296
- def __init__(self, catValues, delim):
297
- """
298
- initializer
299
-
300
- Parameters
301
- catValues : dictionary of categorical values
302
- delim : delimiter
303
- """
304
- self.encoders = {}
305
- self.catValues = catValues
306
- self.delim = delim
307
- for k in self.catValues.keys():
308
- le = preprocessing.LabelEncoder()
309
- le.fit(self.catValues[k])
310
- self.encoders[k] = le
311
-
312
- def processRow(self, row):
313
- """
314
- encode row categorical values
315
-
316
- Parameters:
317
- row : data row
318
- """
319
- #print row
320
- rowArr = row.split(self.delim)
321
- for i in range(len(rowArr)):
322
- if (i in self.catValues):
323
- curVal = rowArr[i]
324
- assert curVal in self.catValues[i], "categorical value invalid"
325
- encVal = self.encoders[i].transform([curVal])
326
- rowArr[i] = str(encVal[0])
327
- return self.delim.join(rowArr)
328
-
329
- def getOrigLabels(self, indx):
330
- """
331
- get original labels
332
-
333
- Parameters:
334
- indx : column index
335
- """
336
- return self.encoders[indx].classes_
337
-
338
-
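
A usage sketch for CatLabelGenerator (same assumed import path); column 1 of the row is categorical, and LabelEncoder assigns labels in sorted class order:

from matumizi.mlutil import CatLabelGenerator

catValues = {1 : ["red", "green", "blue"]}
gen = CatLabelGenerator(catValues, ",")
print(gen.processRow("4.2,red,7"))   #4.2,2,7 since classes sort as blue, green, red
print(gen.getOrigLabels(1))          #['blue' 'green' 'red']
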
339
- class SupvLearningDataGenerator:
340
- """
341
- data generator for supervised learning
342
- """
343
- def __init__(self, configFile):
344
- """
345
- initializer
346
-
347
- Parameters
348
- configFile : config file path
349
- """
350
- defValues = dict()
351
- defValues["common.num.samp"] = (100, None)
352
- defValues["common.num.feat"] = (5, None)
353
- defValues["common.feat.trans"] = (None, None)
354
- defValues["common.feat.types"] = (None, "missing feature types")
355
- defValues["common.cat.feat.distr"] = (None, None)
356
- defValues["common.output.precision"] = (3, None)
357
- defValues["common.error"] = (0.01, None)
358
- defValues["class.gen.technique"] = ("blob", None)
359
- defValues["class.num.feat.informative"] = (2, None)
360
- defValues["class.num.feat.redundant"] = (2, None)
361
- defValues["class.num.feat.repeated"] = (0, None)
362
- defValues["class.num.feat.cat"] = (0, None)
363
- defValues["class.num.class"] = (2, None)
364
-
365
- self.config = Configuration(configFile, defValues)
366
-
367
- def genClassifierData(self):
368
- """
369
- generates classifier data
370
- """
371
- nsamp = self.config.getIntConfig("common.num.samp")[0]
372
- nfeat = self.config.getIntConfig("common.num.feat")[0]
373
- nclass = self.config.getIntConfig("class.num.class")[0]
374
- #transform with shift and scale
375
- ftrans = self.config.getFloatListConfig("common.feat.trans")[0]
376
- feTrans = dict()
377
- for i in range(0, len(ftrans), 2):
378
- tr = (ftrans[i], ftrans[i+1])
379
- indx = int(i/2)
380
- feTrans[indx] = tr
381
-
382
- ftypes = self.config.getStringListConfig("common.feat.types")[0]
383
-
384
- # categorical feature distribution
385
- feCatDist = dict()
386
- fcatdl = self.config.getStringListConfig("common.cat.feat.distr")[0]
387
- for fcatds in (fcatdl if fcatdl is not None else []):	#tolerate missing categorical feature distr
388
- fcatd = fcatds.split(":")
389
- feInd = int(fcatd[0])
390
- clVal = int(fcatd[1])
391
- key = (feInd, clVal) #feature index and class value
392
- dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2)))
393
- feCatDist[key] = CategoricalRejectSampler(*dist)
394
-
395
- #generation technique and label error
396
- genTechnique = self.config.getStringConfig("class.gen.technique")[0]
397
- error = self.config.getFloatConfig("common.error")[0]
398
- if genTechnique == "blob":
399
- features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)
400
- for i in range(nsamp): #shift and scale
401
- for j in range(nfeat):
402
- tr = feTrans[j]
403
- features[i,j] = (features[i,j] + tr[0]) * tr[1]
404
- claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz)))
405
- elif genTechnique == "classify":
406
- nfeatInfo = self.config.getIntConfig("class.num.feat.informative")[0]
407
- nfeatRed = self.config.getIntConfig("class.num.feat.redundant")[0]
408
- nfeatRep = self.config.getIntConfig("class.num.feat.repeated")[0]
409
- shifts = list(map(lambda i : feTrans[i][0], range(nfeat)))
410
- scales = list(map(lambda i : feTrans[i][1], range(nfeat)))
411
- features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed,
412
- n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)
413
- else:
414
- raise "invalid genaration technique"
415
-
416
- # add categorical features and format
417
- nCatFeat = self.config.getIntConfig("class.num.feat.cat")[0]
418
- prec = self.config.getIntConfig("common.output.precision")[0]
419
- for f , c in zip(features, claz):
420
- nfs = list(map(lambda i : self.numFeToStr(f[i], ftypes[i], prec), range(nfeat)))
421
- if nCatFeat > 0:
422
- cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1)))
423
- rec = ",".join(nfs) + "," + ",".join(cfs) + "," + str(c)
424
- else:
425
- rec = ",".join(nfs) + "," + str(c)
426
- yield rec
427
-
428
- def numFeToStr(self, fv, ft, prec):
429
- """
430
- numeric feature value to string
431
-
432
- Parameters
433
- fv : field value
434
- ft : field data type
435
- prec : precision
436
- """
437
- if ft == "float":
438
- s = formatFloat(prec, fv)
439
- elif ft =="int":
440
- s = str(int(fv))
441
- else:
442
- raise "invalid type expecting float or int"
443
- return s
444
-
445
- def catFe(self, i, cv, ft, feCatDist):
446
- """
447
- generate categorical feature
448
-
449
- Parameters
450
- i : col index
451
- cv : class value
452
- ft : field data type
453
- feCatDist : cat value distribution
454
- """
455
- if ft == "cat":
456
- key = (i, cv)
457
- s = feCatDist[key].sample()
458
- else:
459
- raise "invalid type expecting categorical"
460
- return s
461
-
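
A sketch of driving genClassifierData with the default blob technique (hedged: the property file below is hypothetical; keys set to "_" fall back to the defaults listed in the initializer, and "none" disables categorical features):

from matumizi.mlutil import SupvLearningDataGenerator

#class.properties contains:
#common.num.samp=10
#common.num.feat=2
#common.feat.trans=0,1.0,0,1.0
#common.feat.types=float,float
#common.cat.feat.distr=none
#common.output.precision=_
#common.error=_
#class.gen.technique=_
#class.num.class=_
#class.num.feat.cat=_
gen = SupvLearningDataGenerator("class.properties")
for rec in gen.genClassifierData():
	print(rec)		#e.g. 1.352,-0.214,0
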
462
- class RegressionDataGenerator:
463
- """
464
- data generator for regression, including square terms, cross terms, bias, noise, correlated variables
465
- and user defined function
466
- """
467
- def __init__(self, configFile, callback=None):
468
- """
469
- initializer
470
-
471
- Parameters
472
- configFile : config file path
473
- callback : user defined function
474
- """
475
- defValues = dict()
476
- defValues["common.pvar.samplers"] = (None, None)
477
- defValues["common.pvar.ranges"] = (None, None)
478
- defValues["common.linear.weights"] = (None, None)
479
- defValues["common.square.weights"] = (None, None)
480
- defValues["common.crterm.weights"] = (None, None)
481
- defValues["common.corr.params"] = (None, None)
482
- defValues["common.bias"] = (0, None)
483
- defValues["common.noise"] = (None, None)
484
- defValues["common.tvar.range"] = (None, None)
485
- defValues["common.weight.niter"] = (20, None)
486
- self.config = Configuration(configFile, defValues)
487
- self.callback = callback
488
-
489
- #samplers for predictor variables
490
- items = self.config.getStringListConfig("common.pvar.samplers")[0]
491
- self.samplers = list(map(lambda s : createSampler(s), items))
492
- self.npvar = len(self.samplers)
493
-
494
- #values range for predictor variables
495
- items = self.config.getStringListConfig("common.pvar.ranges")[0]
496
- self.pvranges = list()
497
- for i in range(0, len(items), 2):
498
- if items[i] =="none":
499
- r = None
500
- else:
501
- vmin = float(items[i])
502
- vmax = float(items[i+1])
503
- r = (vmin, vmax, vmax-vmin)
504
- self.pvranges.append(r)
505
- assertEqual(len(self.pvranges), self.npvar, "no of predictor var ranges provided is invalid")
506
-
507
-
508
- #linear weights for predictor variables
509
- self.lweights = self.config.getFloatListConfig("common.linear.weights")[0]
510
- assertEqual(len(self.lweights), self.npvar, "no of linear weights provided is invalid")
511
-
512
-
513
- #square weights for predictor variables
514
- items = self.config.getStringListConfig("common.square.weights")[0]
515
- self.sqweight = dict()
516
- for i in range(0, len(items), 2):
517
- vi = int(items[i])
518
- assertLesser(vi, self.npvar, "invalid predictor var index")
519
- wt = float(items[i+1])
520
- self.sqweight[vi] = wt
521
-
522
- #crossterm weights for predictor variables
523
- items = self.config.getStringListConfig("common.crterm.weights")[0]
524
- self.crweight = dict()
525
- for i in range(0, len(items), 3):
526
- vi = int(items[i])
527
- assertLesser(vi, self.npvar, "invalid predictor var index")
528
- vj = int(items[i+1])
529
- assertLesser(vj, self.npvar, "invalid predictor var index")
530
- wt = float(items[i+2])
531
- vp = (vi, vj)
532
- self.crweight[vp] = wt
533
-
534
- #correlated variables
535
- items = self.config.getStringListConfig("common.corr.params")[0]
536
- self.corrparams = dict()
537
- for co in items:
538
- cparam = co.split(":")
539
- vi = int(cparam[0])
540
- vj = int(cparam[1])
541
- k = (vi,vj)
542
- bias = float(cparam[2])
543
- wt = float(cparam[3])
544
- noise = float(cparam[4])
545
- roundoff = cparam[5] == "true"
546
- v = (bias, wt, noise, roundoff)
547
- self.corrparams[k] = v
548
-
549
-
550
- #bias, noise and target range values
551
- self.bias = self.config.getFloatConfig("common.bias")[0]
552
- noise = self.config.getStringListConfig("common.noise")[0]
553
- self.ndistr = noise[0]
554
- self.noise = float(noise[1])
555
- self.tvarlim = self.config.getFloatListConfig("common.tvar.range")[0]
556
-
557
- #sample
558
- niter = self.config.getIntConfig("common.weight.niter")[0]
559
- yvals = list()
560
- for i in range(niter):
561
- y = self.sample()[1]
562
- yvals.append(y)
563
-
564
- #scale weights by sampled mean and target mean
565
- my = statistics.mean(yvals)
566
- myt = (self.tvarlim[0] + self.tvarlim[1]) / 2	#mid point of target range
567
- sc = (myt - self.bias) / (my - self.bias)
568
- #print("weight scale {:.3f}".format(sc))
569
- self.lweights = list(map(lambda w : w * sc, self.lweights))
570
- #print("weights {}".format(toStrFromList(self.lweights, 3)))
571
-
572
- for k in self.sqweight.keys():
573
- self.sqweight[k] *= sc
574
-
575
- for k in self.crweight.keys():
576
- self.crweight[k] *= sc
577
-
578
-
579
- def sample(self):
580
- """
581
- sample predictor variables and target variable
582
-
583
- """
584
- pvd = list(map(lambda s : s.sample(), self.samplers))
585
-
586
- #correct for correlated variables
587
- for k in self.corrparams.keys():
588
- vi = k[0]
589
- vj = k[1]
590
- v = self.corrparams[k]
591
- bias = v[0]
592
- wt = v[1]
593
- noise = v[2]
594
- roundoff = v[3]
595
- nv = bias + wt * pvd[vi]
596
- pvd[vj] = preturbScalar(nv, noise, "normal")
597
- if roundoff:
598
- pvd[vj] = round(pvd[vj])
599
-
600
- spvd = list()
601
- lsum = self.bias
602
- for i in range(self.npvar):
603
- #range limit
604
- if self.pvranges[i] is not None:
605
- pvd[i] = rangeLimit(pvd[i], self.pvranges[i][0], self.pvranges[i][1])
606
- spvd.append(pvd[i])
607
-
608
- #scale
609
- pvd[i] = scaleMinMaxScaData(pvd[i], self.pvranges[i])
610
- lsum += self.lweights[i] * pvd[i]
611
-
612
- #square terms
613
- ssum = 0
614
- for k in self.sqweight.keys():
615
- ssum += self.sqweight[k] * pvd[k] * pvd[k]
616
-
617
- #cross terms
618
- crsum = 0
619
- for k in self.crweight.keys():
620
- vi = k[0]
621
- vj = k[1]
622
- crsum += self.crweight[k] * pvd[vi] * pvd[vj]
623
-
624
- y = lsum + ssum + crsum
625
- y = preturbScalar(y, self.noise, self.ndistr)
626
- if self.callback is not None:
627
- ufy = self.callback(spvd)
628
- y += ufy
629
- r = (spvd, y)
630
- return r
631
-
632
-
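
The target built in sample() reduces to y = bias + sum(wi * xi) + sum(sk * xk^2) + sum(cij * xi * xj) plus noise; a self-contained numpy illustration of that formula (not the class API, which needs a full config file) follows:

import numpy as np

rng = np.random.default_rng(42)
bias = 10.0
lweights = np.array([2.0, -1.5, 0.7])	#linear weights
sqweight = {0 : 0.5}			#square term on variable 0
crweight = {(0, 2) : 0.3}		#cross term between variables 0 and 2

def target(x, noiseSd=0.1):
	y = bias + lweights.dot(x)
	y += sum(w * x[k] * x[k] for k, w in sqweight.items())
	y += sum(w * x[i] * x[j] for (i, j), w in crweight.items())
	return y + rng.normal(0.0, noiseSd)

x = rng.uniform(0.0, 1.0, size=3)
print(x, target(x))
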
633
- def loadDataFile(file, delim, cols, colIndices):
634
- """
635
- loads delim separated file and extracts columns
636
-
637
- Parameters
638
- file : file path
639
- delim : delimiter
640
- cols : columns to use from file
641
- colIndices : columns to extract
642
- """
643
- data = np.loadtxt(file, delimiter=delim, usecols=cols)
644
- extrData = data[:,colIndices]
645
- return (data, extrData)
646
-
647
- def loadFeatDataFile(file, delim, cols):
648
- """
649
- loads delim separated file and extracts columns
650
-
651
- Parameters
652
- file : file path
653
- delim : delimiter
654
- cols : columns to use from file
655
- """
656
- data = np.loadtxt(file, delimiter=delim, usecols=cols)
657
- return data
658
-
659
- def extrColumns(arr, columns):
660
- """
661
- extracts columns
662
-
663
- Parameters
664
- arr : 2D array
665
- columns : columns
666
- """
667
- return arr[:, columns]
668
-
669
- def subSample(featData, clsData, subSampleRate, withReplacement):
670
- """
671
- subsample feature and class label data
672
-
673
- Parameters
674
- featData : 2D array of feature data
675
- clsData : arrray of class labels
676
- subSampleRate : fraction to be sampled
677
- withReplacement : true if sampling with replacement
678
- """
679
- sampSize = int(featData.shape[0] * subSampleRate)
680
- sampledIndx = np.random.choice(featData.shape[0],sampSize, replace=withReplacement)
681
- sampFeat = featData[sampledIndx]
682
- sampCls = clsData[sampledIndx]
683
- return(sampFeat, sampCls)
684
-
685
- def euclideanDistance(x,y):
686
- """
687
- euclidean distance
688
-
689
- Parameters
690
- x : first vector
691
- y : second vector
692
- """
693
- return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))
694
-
695
- def squareRooted(x):
696
- """
697
- square root of sum square
698
-
699
- Parameters
700
- x : data vector
701
- """
702
- return round(sqrt(sum([a*a for a in x])),3)
703
-
704
- def cosineSimilarity(x,y):
705
- """
706
- cosine similarity
707
-
708
- Parameters
709
- x : first vector
710
- y : second vector
711
- """
712
- numerator = sum(a*b for a,b in zip(x,y))
713
- denominator = squareRooted(x) * squareRooted(y)
714
- return round(numerator / float(denominator), 3)
715
-
716
- def cosineDistance(x,y):
717
- """
718
- cosine distance
719
-
720
- Parameters
721
- x : first vector
722
- y : second vector
723
- """
724
- return 1.0 - cosineSimilarity(x,y)
725
-
726
- def manhattanDistance(x,y):
727
- """
728
- manhattan distance
729
-
730
- Parameters
731
- x : first vector
732
- y : second vector
733
- """
734
- return sum(abs(a-b) for a,b in zip(x,y))
735
-
736
- def nthRoot(value, nRoot):
737
- """
738
- nth root
739
-
740
- Parameters
741
- value : data value
742
- nRoot : root
743
- """
744
- rootValue = 1/float(nRoot)
745
- return round (Decimal(value) ** Decimal(rootValue),3)
746
-
747
- def minkowskiDistance(x,y,pValue):
748
- """
749
- minkowski distance
750
-
751
- Parameters
752
- x : first vector
753
- y : second vector
754
- pValue : power factor
755
- """
756
- return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue)
757
-
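
Quick checks for the distance functions above (assumed import path):

from matumizi.mlutil import euclideanDistance, cosineSimilarity, manhattanDistance, minkowskiDistance

x, y = [1.0, 2.0, 3.0], [2.0, 4.0, 6.0]
print(euclideanDistance(x, y))		#3.7417, sqrt(1 + 4 + 9)
print(cosineSimilarity(x, y))		#1.0, the vectors are parallel
print(manhattanDistance(x, y))		#6.0
print(minkowskiDistance(x, y, 3))	#3.302, cube root of 36
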
758
- def jaccardSimilarityX(x,y):
759
- """
760
- jaccard similarity
761
-
762
- Parameters
763
- x : first vector
764
- y : second vector
765
- """
766
- intersectionCardinality = len(set.intersection(*[set(x), set(y)]))
767
- unionCardinality = len(set.union(*[set(x), set(y)]))
768
- return intersectionCardinality/float(unionCardinality)
769
-
770
- def jaccardSimilarity(x,y,wx=1.0,wy=1.0):
771
- """
772
- jaccard similarity
773
-
774
- Parameters
775
- x : first vector
776
- y : second vector
777
- wx : weight for x
778
- wy : weight for y
779
- """
780
- sx = set(x)
781
- sy = set(y)
782
- sxyInt = sx.intersection(sy)
783
- intCardinality = len(sxyInt)
784
- sxIntDiff = sx.difference(sxyInt)
785
- syIntDiff = sy.difference(sxyInt)
786
- unionCardinality = len(sx.union(sy))
787
- return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff))
788
-
789
- def levenshteinSimilarity(s1, s2):
790
- """
791
- Levenshtein similarity for strings
792
-
793
- Parameters
794
- s1 : first string
795
- s2 : second string
796
- """
797
- assert type(s1) == str and type(s2) == str, "Levenshtein similarity is for string only"
798
- d = ld(s1,s2)
799
- #print(d)
800
- l = max(len(s1),len(s2))
801
- d = 1.0 - min(d/l, 1.0)
802
- return d
803
-
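
Similar checks for the set and string similarity functions:

from matumizi.mlutil import jaccardSimilarity, levenshteinSimilarity

print(jaccardSimilarity([1, 2, 3], [2, 3, 4]))		#0.5, 2 shared out of 4 distinct values
print(levenshteinSimilarity("kitten", "sitting"))	#about 0.571, 1 - 3/7 with edit distance 3
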
804
- def norm(values, po=2):
805
- """
806
- norm
807
-
808
- Parameters
809
- values : list of values
810
- po : power
811
- """
812
- no = sum(list(map(lambda v: pow(v,po), values)))
813
- no = pow(no,1.0/po)
814
- return list(map(lambda v: v/no, values))
815
-
816
- def createOneHotVec(size, indx = -1):
817
- """
818
- random one hot vector
819
-
820
- Parameters
821
- size : vector size
822
- indx : one hot position
823
- """
824
- vec = [0] * size
825
- s = random.randint(0, size - 1) if indx < 0 else indx
826
- vec[s] = 1
827
- return vec
828
-
829
- def createAllOneHotVec(size):
830
- """
831
- create all one hot vectors
832
-
833
- Parameters
834
- size : vector size and no of vectors
835
- """
836
- vecs = list()
837
- for i in range(size):
838
- vec = [0] * size
839
- vec[i] = 1
840
- vecs.append(vec)
841
- return vecs
842
-
843
- def blockShuffle(data, blockSize):
844
- """
845
- block shuffle
846
-
847
- Parameters
848
- data : list data
849
- blockSize : block size
850
- """
851
- numBlock = int(len(data) / blockSize)
852
- remain = len(data) % blockSize
853
- numBlock += (1 if remain > 0 else 0)
854
- shuffled = list()
855
- for i in range(numBlock):
856
- b = random.randint(0, numBlock-1)
857
- beg = b * blockSize
858
- if (b < numBlock-1):
859
- end = beg + blockSize
860
- shuffled.extend(data[beg:end])
861
- else:
862
- shuffled.extend(data[beg:])
863
- return shuffled
864
-
865
- def shuffle(data, numShuffle):
866
- """
867
- shuffle data by random swapping
868
-
869
- Parameters
870
- data : list data
871
- numShuffle : no of pairwise swaps
872
- """
873
- sz = len(data)
874
- if numShuffle is None:
875
- numShuffle = int(sz / 2)
876
- for i in range(numShuffle):
877
- fi = random.randint(0, sz -1)
878
- se = random.randint(0, sz -1)
879
- tmp = data[fi]
880
- data[fi] = data[se]
881
- data[se] = tmp
882
-
883
- def randomWalk(size, start, lowStep, highStep):
884
- """
885
- random walk
886
-
887
- Parameters
888
- size : no of steps in the walk
889
- start : initial position
890
- lowStep : step min
891
- highStep : step max
892
- """
893
- cur = start
894
- for i in range(size):
895
- yield cur
896
- cur += randomFloat(lowStep, highStep)
897
-
898
- def binaryEcodeCategorical(values, value):
899
- """
900
- one hot binary encoding
901
-
902
- Parameters
903
- values : list of values
904
- value : value to be replaced with 1
905
- """
906
- size = len(values)
907
- vec = [0] * size
908
- for i in range(size):
909
- if (values[i] == value):
910
- vec[i] = 1
911
- return vec
912
-
913
- def createLabeledSeq(inputData, tw):
914
- """
915
- Creates feature, label pair from sequence data, where we have tw number of features followed by output
916
-
917
- Parameters
918
- inputData : list containing features and labels
919
- tw : no of features
920
- """
921
- features = list()
922
- labels = list()
923
- l = len(inputData)
924
- for i in range(l - tw):
925
- trainSeq = inputData[i:i+tw]
926
- trainLabel = inputData[i+tw]
927
- features.append(trainSeq)
928
- labels.append(trainLabel)
929
- return (features, labels)
930
-
931
- def createLabeledSeqFromFile(filePath, delim, index, tw):	#renamed so it does not shadow the list based version above
932
- """
933
- Creates feature, label pair from 1D sequence data in file
934
-
935
- Parameters
936
- filePath : file path
937
- delim : delimiter
938
- index : column index
939
- tw : no of features
940
- """
941
- seqData = getFileColumnAsFloat(filePath, delim, index)
942
- return createLabeledSeq(seqData, tw)
943
-
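
A sliding window example for createLabeledSeq with tw = 3:

from matumizi.mlutil import createLabeledSeq

feats, labels = createLabeledSeq([1, 2, 3, 4, 5, 6], 3)
print(feats)	#[[1, 2, 3], [2, 3, 4], [3, 4, 5]]
print(labels)	#[4, 5, 6]
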
944
- def fromMultDimSeqToTabular(data, inpSize, seqLen):
945
- """
946
- Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize)
947
-
948
- Parameters
949
- data : 2D array
950
- inpSize : each input size in sequence
951
- seqLen : sequence length
952
- """
953
- nrow = data.shape[0]
954
- assert data.shape[1] == inpSize * seqLen, "invalid input size or sequence length"
955
- return data.reshape(nrow * seqLen, inpSize)
956
-
957
- def fromTabularToMultDimSeq(data, inpSize, seqLen):
958
- """
959
- Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen)
960
-
961
- Parameters
962
- data : 2D array
963
- inpSize : each input size in sequence
964
- seqLen : sequence length
965
- """
966
- nrow = int(data.shape[0] / seqLen)
967
- assert data.shape[1] == inpSize, "invalid input size"
968
- return data.reshape(nrow, seqLen * inpSize)
969
-
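
The two reshapers are inverses of each other, as this round trip shows:

import numpy as np
from matumizi.mlutil import fromMultDimSeqToTabular, fromTabularToMultDimSeq

data = np.arange(12).reshape(2, 6)		#2 rows, inpSize 2, seqLen 3
tab = fromMultDimSeqToTabular(data, 2, 3)	#shape (6, 2)
back = fromTabularToMultDimSeq(tab, 2, 3)	#shape (2, 6)
print(np.array_equal(data, back))		#True
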
970
- def difference(data, interval=1):
971
- """
972
- takes difference in time series data
973
-
974
- Parameters
975
- data : list data
976
- interval : interval for difference
977
- """
978
- diff = list()
979
- for i in range(interval, len(data)):
980
- value = data[i] - data[i - interval]
981
- diff.append(value)
982
- return diff
983
-
984
- def normalizeMatrix(data, norm, axis=1):
985
- """
986
- normalized each row of the matrix
987
-
988
- Parameters
989
- data : 2D data
990
- norm : normalization method
991
- axis : row or column
992
- """
993
- normalized = preprocessing.normalize(data,norm=norm, axis=axis)
994
- return normalized
995
-
996
- def standardizeMatrix(data, axis=0):
997
- """
998
- standardizes each column of the matrix with mean and std deviation
999
-
1000
- Parameters
1001
- data : 2D data
1002
- axis : row or column
1003
- """
1004
- standardized = preprocessing.scale(data, axis=axis)
1005
- return standardized
1006
-
1007
- def asNumpyArray(data):
1008
- """
1009
- converts to numpy array
1010
-
1011
- Parameters
1012
- data : array
1013
- """
1014
- return np.array(data)
1015
-
1016
- def perfMetric(metric, yActual, yPred, clabels=None):
1017
- """
1018
- predictive model accuracy metric
1019
-
1020
- Parameters
1021
- metric : accuracy metric
1022
- yActual : actual values array
1023
- yPred : predicted values array
1024
- clabels : class labels
1025
- """
1026
- if metric == "rsquare":
1027
- score = metrics.r2_score(yActual, yPred)
1028
- elif metric == "mae":
1029
- score = metrics.mean_absolute_error(yActual, yPred)
1030
- elif metric == "mse":
1031
- score = metrics.mean_squared_error(yActual, yPred)
1032
- elif metric == "acc":
1033
- yPred = np.rint(yPred)
1034
- score = metrics.accuracy_score(yActual, yPred)
1035
- elif metric == "mlAcc":
1036
- yPred = np.argmax(yPred, axis=1)
1037
- score = metrics.accuracy_score(yActual, yPred)
1038
- elif metric == "prec":
1039
- yPred = np.argmax(yPred, axis=1)
1040
- score = metrics.precision_score(yActual, yPred)
1041
- elif metric == "rec":
1042
- yPred = np.argmax(yPred, axis=1)
1043
- score = metrics.recall_score(yActual, yPred)
1044
- elif metric == "fone":
1045
- yPred = np.argmax(yPred, axis=1)
1046
- score = metrics.f1_score(yActual, yPred)
1047
- elif metric == "confm":
1048
- yPred = np.argmax(yPred, axis=1)
1049
- score = metrics.confusion_matrix(yActual, yPred)
1050
- elif metric == "clarep":
1051
- yPred = np.argmax(yPred, axis=1)
1052
- score = metrics.classification_report(yActual, yPred)
1053
- elif metric == "bce":
1054
- if clabels is None:
1055
- clabels = [0, 1]
1056
- score = metrics.log_loss(yActual, yPred, labels=clabels)
1057
- elif metric == "ce":
1058
- assert clabels is not None, "labels must be provided"
1059
- score = metrics.log_loss(yActual, yPred, labels=clabels)
1060
- else:
1061
- exitWithMsg("invalid prediction performance metric " + metric)
1062
- return score
1063
-
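
A sketch of perfMetric on a two class problem; the "mlAcc" metric takes argmax over per class probabilities before scoring:

import numpy as np
from matumizi.mlutil import perfMetric

yActual = np.array([0, 1, 1, 0])
yPred = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]])
print(perfMetric("mlAcc", yActual, yPred))	#1.0, argmax matches the labels
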
1064
- def scaleData(data, method):
1065
- """
1066
- scales feature data column wise
1067
-
1068
- Parameters
1069
- data : 2D array
1070
- method : scaling method
1071
- """
1072
- if method == "minmax":
1073
- scaler = preprocessing.MinMaxScaler()
1074
- data = scaler.fit_transform(data)
1075
- elif method == "zscale":
1076
- data = preprocessing.scale(data)
1077
- else:
1078
- raise ValueError("invalid scaling method")
1079
- return data
1080
-
1081
- def scaleDataWithParams(data, method, scParams):
1082
- """
1083
- scales feature data column wise
1084
-
1085
- Parameters
1086
- data : 2D array
1087
- method : scaling method
1088
- scParams : scaling parameters
1089
- """
1090
- if method == "minmax":
1091
- data = scaleMinMaxTabData(data, scParams)
1092
- elif method == "zscale":
1093
- raise ValueError("invalid scaling method")
1094
- else:
1095
- raise ValueError("invalid scaling method")
1096
- return data
1097
-
1098
- def scaleMinMaxScaData(data, minMax):
1099
- """
1100
- minmax scales scalar data
1101
-
1102
- Parameters
1103
- data : scalar data
1104
- minMax : min, max and range for each column
1105
- """
1106
- sd = (data - minMax[0]) / minMax[2]
1107
- return sd
1108
-
1109
-
1110
- def scaleMinMaxTabData(tdata, minMax):
1111
- """
1112
- for tabular scales feature data column wise using min max values for each field
1113
-
1114
- Parameters
1115
- tdata : 2D array
1116
- minMax : min, max and range for each column
1117
- """
1118
- stdata = list()
1119
- for r in tdata:
1120
- srdata = list()
1121
- for i, c in enumerate(r):
1122
- sd = (c - minMax[i][0]) / minMax[i][2]
1123
- srdata.append(sd)
1124
- stdata.append(srdata)
1125
- return stdata
1126
-
1127
- def scaleMinMax(rdata, minMax):
1128
- """
1129
- scales feature data column wise using min max values for each field
1130
-
1131
- Parameters
1132
- rdata : data array
1133
- minMax : min, max and range for each column
1134
- """
1135
- srdata = list()
1136
- for i in range(len(rdata)):
1137
- d = rdata[i]
1138
- sd = (d - minMax[i][0]) / minMax[i][2]
1139
- srdata.append(sd)
1140
- return srdata
1141
-
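
A sketch of scaleMinMax; each minMax entry is a (min, max, range) triple for one column:

from matumizi.mlutil import scaleMinMax

minMax = [(0.0, 10.0, 10.0), (100.0, 200.0, 100.0)]
print(scaleMinMax([2.5, 150.0], minMax))	#[0.25, 0.5]
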
1142
- def harmonicNum(n):
1143
- """
1144
- harmonic number
1145
-
1146
- Parameters
1147
- n : number
1148
- """
1149
- h = 0
1150
- for i in range(1, n+1, 1):
1151
- h += 1.0 / i
1152
- return h
1153
-
1154
- def digammaFun(n):
1155
- """
1156
- digamma function
1157
-
1158
- Parameters
1159
- n : number
1160
- """
1161
- #Euler Mascheroni constant
1162
- ec = 0.577216
1163
- return harmonicNum(n - 1) - ec
1164
-
1165
- def getDataPartitions(tdata, types, columns = None):
1166
- """
1167
- partitions data with the given columns and random split point defined with predicates
1168
-
1169
- Parameters
1170
- tdata : 2D array
1171
- types : data types
1172
- columns : column indexes
1173
- """
1174
- (dtypes, cvalues) = extractTypesFromString(types)
1175
- if columns is None:
1176
- ncol = len(tdata[0])
1177
- columns = list(range(ncol))
1178
- ncol = len(columns)
1179
- #print(columns)
1180
-
1181
- # partition predicates
1182
- partitions = None
1183
- for c in columns:
1184
- #print(c)
1185
- dtype = dtypes[c]
1186
- pred = list()
1187
- if dtype == "int" or dtype == "float":
1188
- (vmin, vmax) = getColMinMax(tdata, c)
1189
- r = vmax - vmin
1190
- rmin = vmin + .2 * r
1191
- rmax = vmax - .2 * r
1192
- sp = randomFloat(rmin, rmax)
1193
- if dtype == "int":
1194
- sp = int(sp)
1195
- else:
1196
- sp = "{:.3f}".format(sp)
1197
- sp = float(sp)
1198
- pred.append([c, "LT", sp])
1199
- pred.append([c, "GE", sp])
1200
- elif dtype == "cat":
1201
- cv = cvalues[c]
1202
- card = len(cv)
1203
- if card < 3:
1204
- num = 1
1205
- else:
1206
- num = randomInt(1, card - 1)
1207
- sp = selectRandomSubListFromList(cv, num)
1208
- sp = " ".join(sp)
1209
- pred.append([c, "IN", sp])
1210
- pred.append([c, "NOTIN", sp])
1211
-
1212
- #print(pred)
1213
- if partitions is None:
1214
- partitions = pred.copy()
1215
- #print("initial")
1216
- #print(partitions)
1217
- else:
1218
- #print("extension")
1219
- tparts = list()
1220
- for p in partitions:
1221
- #print(p)
1222
- l1 = p.copy()
1223
- l1.extend(pred[0])
1224
- l2 = p.copy()
1225
- l2.extend(pred[1])
1226
- #print("after extension")
1227
- #print(l1)
1228
- #print(l2)
1229
- tparts.append(l1)
1230
- tparts.append(l2)
1231
- partitions = tparts
1232
- #print("extending")
1233
- #print(partitions)
1234
-
1235
- #for p in partitions:
1236
- #print(p)
1237
- return partitions
1238
-
1239
- def genAlmostUniformDistr(size, nswap=50):
1240
- """
1241
- generate probability distribution
1242
-
1243
- Parameters
1244
- size : distr size
1245
- nswap : no of mass swaps
1246
- """
1247
- un = 1.0 / size
1248
- distr = [un] * size
1249
- distr = mutDistr(distr, 0.1 * un, nswap)
1250
- return distr
1251
-
1252
- def mutDistr(distr, shift, nswap=50):
1253
- """
1254
- mutates a probability distribution
1255
-
1256
- Parameters
1257
- distr distribution
1258
- shift : amount of shift for swap
1259
- nswap : no of mass swaps
1260
- """
1261
- size = len(distr)
1262
- for _ in range(nswap):
1263
- fi = randomInt(0, size -1)
1264
- si = randomInt(0, size -1)
1265
- while fi == si:
1266
- fi = randomInt(0, size -1)
1267
- si = randomInt(0, size -1)
1268
-
1269
- shift = randomFloat(0, shift)
1270
- t = distr[fi]
1271
- distr[fi] -= shift
1272
- if (distr[fi] < 0):
1273
- distr[fi] = 0.0
1274
- shift = t
1275
- distr[si] += shift
1276
- return distr
1277
-
1278
- def generateBinDistribution(size, ntrue):
1279
- """
1280
- generate binary array with some elements set to 1
1281
-
1282
- Parameters
1283
- size : distr size
1284
- ntrue : no of true values
1285
- """
1286
- distr = [0] * size
1287
- idxs = selectRandomSubListFromList(list(range(size)), ntrue)
1288
- for i in idxs:
1289
- distr[i] = 1
1290
- return distr
1291
-
1292
- def mutBinaryDistr(distr, nmut):
1293
- """
1294
- mutate binary distribution
1295
-
1296
- Parameters
1297
- distr : distr
1298
- nmut : no of mutations
1299
- """
1300
- idxs = selectRandomSubListFromList(list(range(len(distr))), nmut)
1301
- for i in idxs:
1302
- distr[i] = distr[i] ^ 1
1303
- return distr
1304
-
1305
- def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=","):
1306
- """
1307
- file record generator that superimposes given data in the specified segment of a column
1308
-
1309
- Parameters
1310
- filePath : file path
1311
- column : column index
1312
- offset : offset into column values
1313
- seqLen : length of subseq
1314
- modifier : data to be superimposed either list or a sampler object
1315
- precision : floating point precision
1316
- delim : delimiter
1317
- """
1318
- beg = offset
1319
- end = beg + seqLen
1320
- isList = type(modifier) == list
1321
- i = 0
1322
- for rec in fileRecGen(filePath, delim):
1323
- if i >= beg and i < end:
1324
- va = float(rec[column])
1325
- if isList:
1326
- va += modifier[i - beg]
1327
- else:
1328
- va += modifier.sample()
1329
- rec[column] = formatFloat(precision, va)
1330
- yield delim.join(rec)
1331
- i += 1
1332
-
1333
- class ShiftedDataGenerator:
1334
- """
1335
- transforms data for distribution shift
1336
- """
1337
- def __init__(self, types, tdata, addFact, multFact):
1338
- """
1339
- initializer
1340
-
1341
- Parameters
1342
- types : data types
1343
- tdata : 2D array
1344
- addFact : factor for data shift
1345
- multFact : factor for data scaling
1346
- """
1347
- (self.dtypes, self.cvalues) = extractTypesFromString(types)
1348
-
1349
- self.limits = dict()
1350
- for k,v in self.dtypes.items():
1351
- if v == "int" or v == "false":
1352
- (vmax, vmin) = getColMinMax(tdata, k)
1353
- self.limits[k] = vmax - vmin
1354
- self.addMin = - addFact / 2
1355
- self.addMax = addFact / 2
1356
- self.multMin = 1.0 - multFact / 2
1357
- self.multMax = 1.0 + multFact / 2
1358
-
1359
-
1360
-
1361
-
1362
- def transform(self, tdata):
1363
- """
1364
- linear transforms data to create distribution shift with random shift and scale
1365
-
1366
- Parameters
1367
- tdata : 2D array
1368
- """
1369
- transforms = dict()
1370
- for k,v in self.dtypes.items():
1371
- if v == "int" or v == "false":
1372
- shift = randomFloat(self.addMin, self.addMax) * self.limits[k]
1373
- scale = randomFloat(self.multMin, self.multMax)
1374
- trns = (shift, scale)
1375
- transforms[k] = trns
1376
- elif v == "cat":
1377
- transforms[k] = isEventSampled(50)
1378
-
1379
- ttdata = list()
1380
- for rec in tdata:
1381
- nrec = rec.copy()
1382
- for c in range(len(rec)):
1383
- if c in self.dtypes:
1384
- dtype = self.dtypes[c]
1385
- if dtype == "int" or dtype == "float":
1386
- (shift, scale) = transforms[c]
1387
- nval = shift + rec[c] * scale
1388
- if dtype == "int":
1389
- nrec[c] = int(nval)
1390
- else:
1391
- nrec[c] = nval
1392
- elif dtype == "cat":
1393
- cv = self.cvalues[c]
1394
- if transforms[c]:
1395
- nval = selectOtherRandomFromList(cv, rec[c])
1396
- nrec[c] = nval
1397
-
1398
- ttdata.append(nrec)
1399
-
1400
- return ttdata
1401
-
1402
- def transformSpecified(self, tdata, sshift, scale):
1403
- """
1404
- linear transforms data to create distribution shift with specified shift and scale
1405
-
1406
- Parameters
1407
- tdata : 2D array
1408
- sshift : shift factor
1409
- scale : scale factor
1410
- """
1411
- transforms = dict()
1412
- for k,v in self.dtypes.items():
1413
- if v == "int" or v == "false":
1414
- shift = sshift * self.limits[k]
1415
- trns = (shift, scale)
1416
- transforms[k] = trns
1417
- elif v == "cat":
1418
- transforms[k] = isEventSampled(50)
1419
-
1420
- ttdata = self.__scaleShift(tdata, transforms)
1421
- return ttdata
1422
-
1423
- def __scaleShift(self, tdata, transforms):
1424
- """
1425
- shifts and scales tabular data
1426
-
1427
- Parameters
1428
- tdata : 2D array
1429
- transforms : transforms to apply
1430
- """
1431
- ttdata = list()
1432
- for rec in tdata:
1433
- nrec = rec.copy()
1434
- for c in range(len(rec)):
1435
- if c in self.dtypes:
1436
- dtype = self.dtypes[c]
1437
- if dtype == "int" or dtype == "float":
1438
- (shift, scale) = transforms[c]
1439
- nval = shift + rec[c] * scale
1440
- if dtype == "int":
1441
- nrec[c] = int(nval)
1442
- else:
1443
- nrec[c] = nval
1444
- elif dtype == "cat":
1445
- cv = self.cvalues[c]
1446
- if transforms[c]:
1447
- #nval = selectOtherRandomFromList(cv, rec[c])
1448
- #nrec[c] = nval
1449
- pass
1450
-
1451
- ttdata.append(nrec)
1452
- return ttdata
1453
-
1454
- class RollingStat(object):
1455
- """
1456
- stats for rolling window
1457
- """
1458
- def __init__(self, wsize):
1459
- """
1460
- initializer
1461
-
1462
- Parameters
1463
- wsize : window size
1464
- """
1465
- self.window = list()
1466
- self.wsize = wsize
1467
- self.mean = None
1468
- self.sd = None
1469
-
1470
- def add(self, value):
1471
- """
1472
- add a value
1473
-
1474
- Parameters
1475
- value : value to add
1476
- """
1477
- self.window.append(value)
1478
- if len(self.window) > self.wsize:
1479
- self.window = self.window[1:]
1480
-
1481
- def getStat(self):
1482
- """
1483
- get rolling window mean and std deviation
1484
- """
1485
- assertGreater(len(self.window), 0, "window is empty")
1486
- if len(self.window) == 1:
1487
- self.mean = self.window[0]
1488
- self.sd = 0
1489
- else:
1490
- self.mean = statistics.mean(self.window)
1491
- self.sd = statistics.stdev(self.window, xbar=self.mean)
1492
- re = (self.mean, self.sd)
1493
- return re
1494
-
1495
- def getSize(self):
1496
- """
1497
- return window size
1498
- """
1499
- return len(self.window)
1500
-
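
A usage sketch for RollingStat with a window of size 3:

from matumizi.mlutil import RollingStat

rs = RollingStat(3)
for v in [10, 12, 11, 50]:
	rs.add(v)
print(rs.getStat())	#mean and std dev of the last 3 values [12, 11, 50]
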
matumizi/matumizi/sampler.py DELETED
@@ -1,1455 +0,0 @@
1
- #!/usr/local/bin/python3
2
-
3
- # avenir-python: Machine Learning
4
- # Author: Pranab Ghosh
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License"); you
7
- # may not use this file except in compliance with the License. You may
8
- # obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
- # implied. See the License for the specific language governing
16
- # permissions and limitations under the License.
17
-
18
- import sys
19
- import random
20
- import time
21
- import math
22
- import random
23
- import numpy as np
24
- from scipy import stats
25
- from random import randint
26
- from .util import *
27
- from .stats import Histogram
28
-
29
- def randomFloat(low, high):
30
- """
31
- sample float within range
32
-
33
- Parameters
34
- low : low value
35
- high : high value
36
- """
37
- return random.random() * (high-low) + low
38
-
39
- def randomInt(minv, maxv):
40
- """
41
- sample int within range
42
-
43
- Parameters
44
- minv : min value
45
- maxv : max value
46
- """
47
- return randint(minv, maxv)
48
-
49
- def randIndex(lData):
50
- """
51
- random index of a list
52
-
53
- Parameters
54
- lData : list data
55
- """
56
- return randint(0, len(lData)-1)
57
-
58
- def randomUniformSampled(low, high):
59
- """
60
- sample float within range
61
-
62
- Parameters
63
- low : low value
64
- high : high value
65
- """
66
- return np.random.uniform(low, high)
67
-
68
- def randomUniformSampledList(low, high, size):
69
- """
70
- sample floats within range to create list
71
-
72
- Parameters
73
- low : low value
74
- high : high value
75
- size : size of list to be returned
76
- """
77
- return np.random.uniform(low, high, size)
78
-
79
- def randomNormSampled(mean, sd):
80
- """
81
- sample float from normal
82
-
83
- Parameters
84
- mean : mean
85
- sd : std deviation
86
- """
87
- return np.random.normal(mean, sd)
88
-
89
- def randomNormSampledList(mean, sd, size):
90
- """
91
- sample float list from normal
92
-
93
- Parameters
94
- mean : mean
95
- sd : std deviation
96
- size : size of list to be returned
97
- """
98
- return np.random.normal(mean, sd, size)
99
-
100
- def randomSampledList(sampler, size):
101
- """
102
- sample list from given sampler
103
-
104
- Parameters
105
- sampler : sampler object
106
- size : size of list to be returned
107
- """
108
- return list(map(lambda i : sampler.sample(), range(size)))
109
-
110
-
111
- def minLimit(val, minv):
112
- """
113
- min limit
114
-
115
- Parameters
116
- val : value
117
- minv : min limit
118
- """
119
- if (val < minv):
120
- val = minv
121
- return val
122
-
123
-
124
- def rangeLimit(val, minv, maxv):
125
- """
126
- range limit
127
-
128
- Parameters
129
- val : value
130
- minv : min limit
131
- maxv : max limit
132
- """
133
- if (val < minv):
134
- val = minv
135
- elif (val > maxv):
136
- val = maxv
137
- return val
138
-
139
-
140
- def sampleUniform(minv, maxv):
141
- """
142
- sample int within range
143
-
144
- Parameters
145
- minv : int min limit
146
- maxv : int max limit
147
- """
148
- return randint(minv, maxv)
149
-
150
-
151
- def sampleFromBase(value, dev):
152
- """
153
- sample int wrt base
154
-
155
- Parameters
156
- value : base value
157
- dev : deviation
158
- """
159
- return randint(value - dev, value + dev)
160
-
161
-
162
- def sampleFloatFromBase(value, dev):
163
- """
164
- sample float wrt base
165
-
166
- Parameters
167
- value : base value
168
- dev : deviation
169
- """
170
- return randomFloat(value - dev, value + dev)
171
-
172
-
173
- def distrUniformWithRanndom(total, numItems, noiseLevel):
174
- """
175
- uniformly distribute with some randomness and preserves total
176
-
177
- Parameters
178
- total : total count
179
- numItems : no of bins
180
- noiseLevel : noise level fraction
181
- """
182
- perItem = total / numItems
183
- var = perItem * noiseLevel
184
- items = []
185
- for i in range(numItems):
186
- item = perItem + randomFloat(-var, var)
187
- items.append(item)
188
-
189
- #adjust last item
190
- sm = sum(items[:-1])
191
- items[-1] = total - sm
192
- return items
193
-
194
-
195
- def isEventSampled(threshold, maxv=100):
196
- """
197
- sample event which occurs if sampled below threshold
198
-
199
- Parameters
200
- threshold : threshold for sampling
201
- maxv : maximum values
202
- """
203
- return randint(0, maxv) < threshold
204
-
205
-
206
- def sampleBinaryEvents(events, probPercent):
207
- """
208
- sample binary events
209
-
210
- Parameters
211
- events : two events
212
- probPercent : probability as percentage
213
- """
214
- if (randint(0, 100) < probPercent):
215
- event = events[0]
216
- else:
217
- event = events[1]
218
- return event
219
-
220
-
221
- def addNoiseNum(value, sampler):
222
- """
223
- add noise to numeric value
224
-
225
- Parameters
226
- value : base value
227
- sampler : sampler for noise
228
- """
229
- return value * (1 + sampler.sample())
230
-
231
-
232
- def addNoiseCat(value, values, noise):
233
- """
234
- add noise to categorical value i.e with some probability change value
235
-
236
- Parameters
237
- value : cat value
238
- values : cat values
239
- noise : noise level fraction
240
- """
241
- newValue = value
242
- threshold = int(noise * 100)
243
- if (isEventSampled(threshold)):
244
- newValue = selectRandomFromList(values)
245
- while newValue == value:
246
- newValue = selectRandomFromList(values)
247
- return newValue
248
-
249
-
250
- def sampleWithReplace(data, sampSize):
251
- """
252
- sample with replacement
253
-
254
- Parameters
255
- data : array
256
- sampSize : sample size
257
- """
258
- sampled = list()
259
- le = len(data)
260
- if sampSize is None:
261
- sampSize = le
262
- for i in range(sampSize):
263
- j = random.randint(0, le - 1)
264
- sampled.append(data[j])
265
- return sampled
266
-
267
- class CumDistr:
268
- """
269
- cumulative distr
270
- """
271
-
272
- def __init__(self, data, numBins = None):
273
- """
274
- initializer
275
-
276
- Parameters
277
- data : array
278
- numBins : no of bins
279
- """
280
- if not numBins:
281
- numBins = int(len(data) / 5)
282
- res = stats.cumfreq(data, numbins=numBins)
283
- self.cdistr = res.cumcount / len(data)
284
- self.loLim = res.lowerlimit
285
- self.upLim = res.lowerlimit + res.binsize * res.cumcount.size
286
- self.binWidth = res.binsize
287
-
288
- def getDistr(self, value):
289
- """
290
- get cumulative distribution
291
-
292
- Parameters
293
- value : value
294
- """
295
- if value <= self.loLim:
296
- d = 0.0
297
- elif value >= self.upLim:
298
- d = 1.0
299
- else:
300
- bin = int((value - self.loLim) / self.binWidth)
301
- d = self.cdistr[bin]
302
- return d
303
-
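
A usage sketch for CumDistr (hedged: the import path is assumed); for a centered normal the cumulative distribution at 0 is near 0.5:

import numpy as np
from matumizi.sampler import CumDistr

data = np.random.normal(0.0, 1.0, 1000)
cd = CumDistr(data, numBins=50)
print(cd.getDistr(0.0))	#close to 0.5
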
304
- class BernoulliTrialSampler:
305
- """
306
- bernoulli trial sampler, returns True or False, or one of two given event values
307
- """
308
-
309
- def __init__(self, pr, events=None):
310
- """
311
- initializer
312
-
313
- Parameters
314
- pr : probability
315
- events : event values
316
- """
317
- self.pr = pr
318
- self.retEvent = False if events is None else True
319
- self.events = events
320
-
321
-
322
- def sample(self):
323
- """
324
- samples value
325
- """
326
- res = random.random() < self.pr
327
- if self.retEvent:
328
- res = self.events[0] if res else self.events[1]
329
- return res
330
-
331
- class PoissonSampler:
332
- """
333
- poisson sampler returns number of events
334
- """
335
- def __init__(self, rateOccur, maxSamp):
336
- """
337
- initializer
338
-
339
- Parameters
340
- rateOccur : rate of occurence
341
- maxSamp : max limit on no of samples
342
- """
343
- self.rateOccur = rateOccur
344
- self.maxSamp = int(maxSamp)
345
- self.pmax = self.calculatePr(int(self.rateOccur))	#prob near the mode, math.factorial needs an int
346
-
347
- def calculatePr(self, numOccur):
348
- """
349
- calculates probability
350
-
351
- Parameters
352
- numOccur : no of occurence
353
- """
354
- p = (self.rateOccur ** numOccur) * math.exp(-self.rateOccur) / math.factorial(numOccur)
355
- return p
356
-
357
- def sample(self):
358
- """
359
- samples value
360
- """
361
- done = False
362
- samp = 0
363
- while not done:
364
- no = randint(0, self.maxSamp)
365
- sp = randomFloat(0.0, self.pmax)
366
- ap = self.calculatePr(no)
367
- if sp < ap:
368
- done = True
369
- samp = no
370
- return samp
371
-
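
PoissonSampler draws event counts by rejection against the probability near the rate; a sketch with an integer rate:

from matumizi.sampler import PoissonSampler

ps = PoissonSampler(3, 20)		#rate 3 events per interval, samples capped at 20
print([ps.sample() for _ in range(5)])	#e.g. [2, 4, 3, 1, 5]
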
372
- class ExponentialSampler:
373
- """
374
- returns interval between events
375
- """
376
- def __init__(self, rateOccur, maxSamp = None):
377
- """
378
- initializer
379
-
380
- Parameters
381
- rateOccur : rate of occurence
382
- maxSamp : max limit on interval
383
- """
384
- self.interval = 1.0 / rateOccur
385
- self.maxSamp = int(maxSamp) if maxSamp is not None else None
386
-
387
- def sample(self):
388
- """
389
- samples value
390
- """
391
- sampled = np.random.exponential(scale=self.interval)
392
- if self.maxSamp is not None:
393
- while sampled > self.maxSamp:
394
- sampled = np.random.exponential(scale=self.interval)
395
- return sampled
396
-
397
- class UniformNumericSampler:
398
- """
399
- uniform sampler for numerical values
400
- """
401
- def __init__(self, minv, maxv):
402
- """
403
- initializer
404
-
405
- Parameters
406
- minv : min value
407
- maxv : max value
408
- """
409
- self.minv = minv
410
- self.maxv = maxv
411
-
412
- def isNumeric(self):
413
- """
414
- returns true
415
- """
416
- return True
417
-
418
- def sample(self):
419
- """
420
- samples value
421
- """
422
- samp = sampleUniform(self.minv, self.maxv) if isinstance(self.minv, int) else randomFloat(self.minv, self.maxv)
423
- return samp
424
-
425
- class UniformCategoricalSampler:
426
- """
427
- uniform sampler for categorical values
428
- """
429
- def __init__(self, cvalues):
430
- """
431
- initializer
432
-
433
- Parameters
434
- cvalues : categorical value list
435
- """
436
- self.cvalues = cvalues
437
-
438
- def isNumeric(self):
439
- return False
440
-
441
- def sample(self):
442
- """
443
- samples value
444
- """
445
- return selectRandomFromList(self.cvalues)
446
-
447
- class NormalSampler:
448
- """
449
- normal sampler
450
- """
451
- def __init__(self, mean, stdDev):
452
- """
453
- initializer
454
-
455
- Parameters
456
- mean : mean
457
- stdDev : std deviation
458
- """
459
- self.mean = mean
460
- self.stdDev = stdDev
461
- self.sampleAsInt = False
462
-
463
- def isNumeric(self):
464
- return True
465
-
466
- def sampleAsIntValue(self):
467
- """
468
- set True to sample as int
469
- """
470
- self.sampleAsInt = True
471
-
472
- def sample(self):
473
- """
474
- samples value
475
- """
476
- samp = np.random.normal(self.mean, self.stdDev)
477
- if self.sampleAsInt:
478
- samp = int(samp)
479
- return samp
480
-
481
- class LogNormalSampler:
482
- """
483
- log normal sampler
484
- """
485
- def __init__(self, mean, stdDev):
486
- """
487
- initializer
488
-
489
- Parameters
490
- mean : mean
491
- stdDev : std deviation
492
- """
493
- self.mean = mean
494
- self.stdDev = stdDev
495
-
496
- def isNumeric(self):
497
- return True
498
-
499
- def sample(self):
500
- """
501
- samples value
502
- """
503
- return np.random.lognormal(self.mean, self.stdDev)
504
-
505
- class NormalSamplerWithTrendCycle:
506
- """
507
- normal sampler with cycle and trend
508
- """
509
- def __init__(self, mean, stdDev, dmean, cycle, step=1):
510
- """
511
- initializer
512
-
513
- Parameters
514
- mean : mean
515
- stdDev : std deviation
516
- dmean : trend delta
517
- cycle : cycle values wrt base mean
518
- step : adjustment step for cycle and trend
519
- """
520
- self.mean = mean
521
- self.cmean = mean
522
- self.stdDev = stdDev
523
- self.dmean = dmean
524
- self.cycle = cycle
525
- self.clen = len(cycle) if cycle is not None else 0
526
- self.step = step
527
- self.count = 0
528
-
529
- def isNumeric(self):
530
- return True
531
-
532
- def sample(self):
533
- """
534
- samples value
535
- """
536
- s = np.random.normal(self.cmean, self.stdDev)
537
- self.count += 1
538
- if self.count % self.step == 0:
539
- cy = 0
540
- if self.clen > 1:
541
- coff = self.count % self.clen
542
- cy = self.cycle[coff]
543
- tr = self.count * self.dmean
544
- self.cmean = self.mean + tr + cy
545
- return s
546
-
547
-
548
- class ParetoSampler:
549
- """
550
- pareto sampler
551
- """
552
- def __init__(self, mode, shape):
553
- """
554
- initializer
555
-
556
- Parameters
557
- mode : mode
558
- shape : shape
559
- """
560
- self.mode = mode
561
- self.shape = shape
562
-
563
- def isNumeric(self):
564
- return True
565
-
566
- def sample(self):
567
- """
568
- samples value
569
- """
570
- return (np.random.pareto(self.shape) + 1) * self.mode
571
-
572
- class GammaSampler:
573
- """
574
- gamma sampler
575
- """
576
- def __init__(self, shape, scale):
577
- """
578
- initializer
579
-
580
- Parameters
581
- shape : shape
582
- scale : scale
583
- """
584
- self.shape = shape
585
- self.scale = scale
586
-
587
- def isNumeric(self):
588
- return True
589
-
590
- def sample(self):
591
- """
592
- samples value
593
- """
594
- return np.random.gamma(self.shape, self.scale)
595
-
596
- class GaussianRejectSampler:
597
- """
598
- gaussian sampling based on rejection sampling
599
- """
600
- def __init__(self, mean, stdDev):
601
- """
602
- initializer
603
-
604
- Parameters
605
- mean : mean
606
- stdDev : std deviation
607
- """
608
- self.mean = mean
609
- self.stdDev = stdDev
610
- self.xmin = mean - 3 * stdDev
611
- self.xmax = mean + 3 * stdDev
612
- self.ymin = 0.0
613
- self.fmax = 1.0 / (math.sqrt(2.0 * math.pi) * stdDev)
614
- self.ymax = 1.05 * self.fmax
615
- self.sampleAsInt = False
616
-
617
- def isNumeric(self):
618
- return True
619
-
620
- def sampleAsIntValue(self):
621
- """
622
- sample as int value
623
- """
624
- self.sampleAsInt = True
625
-
626
- def sample(self):
627
- """
628
- samples value
629
- """
630
- done = False
631
- samp = 0
632
- while not done:
633
- x = randomFloat(self.xmin, self.xmax)
634
- y = randomFloat(self.ymin, self.ymax)
635
- f = self.fmax * math.exp(-(x - self.mean) * (x - self.mean) / (2.0 * self.stdDev * self.stdDev))
636
- if (y < f):
637
- done = True
638
- samp = x
639
- if self.sampleAsInt:
640
- samp = int(samp)
641
- return samp
642
-
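The class above is plain rejection sampling: propose x uniformly in [mean - 3 sd, mean + 3 sd] and y uniformly in [0, ymax], and accept x when y falls under the density curve. The same generic pattern for any bounded density, as a self-contained sketch (illustrative only, not part of the module):

import random

def rejectSample(f, xmin, xmax, fmax):
    # propose uniformly in the bounding box, accept when y falls under f(x)
    while True:
        x = random.uniform(xmin, xmax)
        y = random.uniform(0.0, fmax)
        if y < f(x):
            return x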
643
- class DiscreteRejectSampler:
644
- """
645
- non parametric sampling for discrete values using given distribution based
646
- on rejection sampling
647
- """
648
- def __init__(self, xmin, xmax, step, *values):
649
- """
650
- initializer
651
-
652
- Parameters
653
- xmin : min value
654
- xmax : max value
655
- step : discrete step
656
- values : distr values
657
- """
658
- self.xmin = xmin
659
- self.xmax = xmax
660
- self.step = step
661
- self.distr = values
662
- if (len(self.distr) == 1):
663
- self.distr = self.distr[0]
664
- numSteps = int((self.xmax - self.xmin) / self.step)
665
- #print("{:.3f} {:.3f} {:.3f} {}".format(self.xmin, self.xmax, self.step, numSteps))
666
- assert len(self.distr) == numSteps + 1, "invalid number of distr values expected {}".format(numSteps + 1)
667
- self.ximin = 0
668
- self.ximax = numSteps
669
- self.pmax = float(max(self.distr))
670
-
671
- def isNumeric(self):
672
- return True
673
-
674
- def sample(self):
675
- """
676
- samples value
677
- """
678
- done = False
679
- samp = None
680
- while not done:
681
- xi = randint(self.ximin, self.ximax)
682
- #print(formatAny(xi, "xi"))
683
- ps = randomFloat(0.0, self.pmax)
684
- pa = self.distr[xi]
685
- if ps < pa:
686
- samp = self.xmin + xi * self.step
687
- done = True
688
- return samp
689
-
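Usage sketch for the discrete sampler (illustrative; assumes matumizi.sampler exposes the class): the constructor expects one distribution value per step from xmin to xmax inclusive, i.e. (xmax - xmin) / step + 1 weights.

from matumizi.sampler import DiscreteRejectSampler

# values 0, 2, 4, 6, 8, 10 with relative weights 1, 2, 4, 4, 2, 1
smp = DiscreteRejectSampler(0, 10, 2, 1, 2, 4, 4, 2, 1)
vals = [smp.sample() for _ in range(10)]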
690
-
691
- class TriangularRejectSampler:
692
- """
693
- non parametric sampling using triangular distribution based on rejection sampling
694
- """
695
- def __init__(self, xmin, xmax, vertexValue, vertexPos=None):
696
- """
697
- initializer
698
-
699
- Parameters
700
- xmin : min value
701
- xmax : max value
702
- vertexValue : distr value at vertex
703
- vertexPos : vertex position
704
- """
705
- self.xmin = xmin
706
- self.xmax = xmax
707
- self.vertexValue = vertexValue
708
- if vertexPos:
709
- assert vertexPos > xmin and vertexPos < xmax, "vertex position outside bound"
710
- self.vertexPos = vertexPos
711
- else:
712
- self.vertexPos = 0.5 * (xmin + xmax)
713
- self.s1 = vertexValue / (self.vertexPos - xmin)
714
- self.s2 = vertexValue / (xmax - self.vertexPos)
715
-
716
- def isNumeric(self):
717
- return True
718
-
719
- def sample(self):
720
- """
721
- samples value
722
- """
723
- done = False
724
- samp = None
725
- while not done:
726
- x = randomFloat(self.xmin, self.xmax)
727
- y = randomFloat(0.0, self.vertexValue)
728
- f = (x - self.xmin) * self.s1 if x < self.vertexPos else (self.xmax - x) * self.s2
729
- if (y < f):
730
- done = True
731
- samp = x
732
-
733
- return samp
734
-
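Usage sketch (illustrative): only the relative height of vertexValue matters for rejection, and the vertex defaults to the midpoint of [xmin, xmax] when vertexPos is not given.

from matumizi.sampler import TriangularRejectSampler

# triangular distribution on [2, 10] peaking at 7
smp = TriangularRejectSampler(2.0, 10.0, 1.0, vertexPos=7.0)
vals = [smp.sample() for _ in range(10)]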
735
- class NonParamRejectSampler:
736
- """
737
- non parametric sampling using given distribution based on rejection sampling
738
- """
739
- def __init__(self, xmin, binWidth, *values):
740
- """
741
- initializer
742
-
743
- Parameters
744
- xmin : min value
745
- binWidth : bin width
746
- values : distr values
747
- """
748
- self.values = values
749
- if (len(self.values) == 1):
750
- self.values = self.values[0]
751
- self.xmin = xmin
752
- self.xmax = xmin + binWidth * (len(self.values) - 1)
753
- #print(self.xmin, self.xmax, binWidth)
754
- self.binWidth = binWidth
755
- self.fmax = 0
756
- for v in self.values:
757
- if (v > self.fmax):
758
- self.fmax = v
759
- self.ymin = 0
760
- self.ymax = self.fmax
761
- self.sampleAsInt = True
762
-
763
- def isNumeric(self):
764
- return True
765
-
766
- def sampleAsFloat(self):
767
- self.sampleAsInt = False
768
-
769
- def sample(self):
770
- """
771
- samples value
772
- """
773
- done = False
774
- samp = 0
775
- while not done:
776
- if self.sampleAsInt:
777
- x = random.randint(self.xmin, self.xmax)
778
- y = random.randint(self.ymin, self.ymax)
779
- else:
780
- x = randomFloat(self.xmin, self.xmax)
781
- y = randomFloat(self.ymin, self.ymax)
782
- bin = int((x - self.xmin) / self.binWidth)
783
- f = self.values[bin]
784
- if (y < f):
785
- done = True
786
- samp = x
787
- return samp
788
-
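Usage sketch (illustrative): the varargs are per-bin relative frequencies; bins start at xmin with the given bin width, and sampling defaults to ints unless sampleAsFloat is called.

from matumizi.sampler import NonParamRejectSampler

# 5 bins of width 10 starting at 20, with relative frequencies
smp = NonParamRejectSampler(20, 10, 15, 28, 40, 25, 12)
smp.sampleAsFloat()
vals = [smp.sample() for _ in range(10)]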
789
- class JointNonParamRejectSampler:
790
- """
791
- non parametric sampling using given distribution based on rejection sampling
792
- """
793
- def __init__(self, xmin, xbinWidth, xnbin, ymin, ybinWidth, ynbin, *values):
794
- """
795
- initializer
796
-
797
- Parameters
798
- xmin : min value for x
799
- xbinWidth : bin width for x
800
- xnbin : no of bins for x
801
- ymin : min value for y
802
- ybinWidth : bin width for y
803
- ynbin : no of bins for y
804
- values : distr values
805
- """
806
- self.values = values
807
- if (len(self.values) == 1):
808
- self.values = self.values[0]
809
- assert len(self.values) == xnbin * ynbin, "wrong number of values for joint distr"
810
- self.xmin = xmin
811
- self.xmax = xmin + xbinWidth * xnbin
812
- self.xbinWidth = xbinWidth
813
- self.ymin = ymin
814
- self.ymax = ymin + ybinWidth * ynbin
815
- self.ybinWidth = ybinWidth
816
- self.pmax = max(self.values)
817
- self.values = np.array(self.values).reshape(xnbin, ynbin)
818
-
819
- def isNumeric(self):
820
- return True
821
-
822
- def sample(self):
823
- """
824
- samples value
825
- """
826
- done = False
827
- samp = 0
828
- while not done:
829
- x = randomFloat(self.xmin, self.xmax)
830
- y = randomFloat(self.ymin, self.ymax)
831
- xbin = int((x - self.xmin) / self.xbinWidth)
832
- ybin = int((y - self.ymin) / self.ybinWidth)
833
- ap = self.values[xbin][ybin]
834
- sp = randomFloat(0.0, self.pmax)
835
- if (sp < ap):
836
- done = True
837
- samp = [x,y]
838
- return samp
839
-
840
-
841
- class JointNormalSampler:
842
- """
843
- joint normal sampler
844
- """
845
- def __init__(self, *values):
846
- """
847
- initializer
848
-
849
- Parameters
850
- values : 2 mean values followed by 4 values for covar matrix
851
- """
852
- lvalues = list(values)
853
- assert len(lvalues) == 6, "incorrect number of arguments for joint normal sampler"
854
- mean = lvalues[:2]
855
- self.mean = np.array(mean)
856
- sd = lvalues[2:]
857
- self.sd = np.array(sd).reshape(2,2)
858
-
859
- def isNumeric(self):
860
- return True
861
-
862
- def sample(self):
863
- """
864
- samples value
865
- """
866
- return list(np.random.multivariate_normal(self.mean, self.sd))
867
-
868
-
869
- class MultiVarNormalSampler:
870
- """
871
- multivariate normal sampler
872
- """
873
- def __init__(self, numVar, *values):
874
- """
875
- initializer
876
-
877
- Parameters
878
- numVar : no of variables
879
- values : numVar mean values followed by numVar x numVar values for covar matrix
880
- """
881
- lvalues = list(values)
882
- assert len(lvalues) == numVar + numVar * numVar, "incorrect number of arguments for multi var normal sampler"
883
- mean = lvalues[:numVar]
884
- self.mean = np.array(mean)
885
- sd = lvalues[numVar:]
886
- self.sd = np.array(sd).reshape(numVar,numVar)
887
-
888
- def isNumeric(self):
889
- return True
890
-
891
- def sample(self):
892
- """
893
- samples value
894
- """
895
- return list(np.random.multivariate_normal(self.mean, self.sd))
896
-
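Usage sketch (illustrative): the varargs are numVar means followed by the flattened numVar x numVar covariance matrix.

from matumizi.sampler import MultiVarNormalSampler

# 2 variables: means (1.0, 2.0), then the flattened 2 x 2 covariance matrix
smp = MultiVarNormalSampler(2, 1.0, 2.0, 0.5, 0.1, 0.1, 0.3)
pair = smp.sample()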
897
- class CategoricalRejectSampler:
898
- """
899
- non parametric sampling for categorical attributes using given distribution based
900
- on rejection sampling
901
- """
902
- def __init__(self, *values):
903
- """
904
- initializer
905
-
906
- Parameters
907
- values : list of tuples, each containing a categorical value and the corresponding distr value
908
- """
909
- self.distr = values
910
- if (len(self.distr) == 1):
911
- self.distr = self.distr[0]
912
- maxv = 0
913
- for t in self.distr:
914
- if t[1] > maxv:
915
- maxv = t[1]
916
- self.maxv = maxv
917
-
918
- def sample(self):
919
- """
920
- samples value
921
- """
922
- done = False
923
- samp = ""
924
- while not done:
925
- t = self.distr[randint(0, len(self.distr)-1)]
926
- d = randomFloat(0, self.maxv)
927
- if (d <= t[1]):
928
- done = True
929
- samp = t[0]
930
- return samp
931
-
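Usage sketch (illustrative; assumes the class is importable from matumizi.sampler): the weights are relative and need not sum to 1.

from matumizi.sampler import CategoricalRejectSampler

smp = CategoricalRejectSampler(("low", 60), ("med", 30), ("high", 10))
vals = [smp.sample() for _ in range(10)]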
932
-
933
- class CategoricalSetSampler:
934
- """
935
- non parametric sampling for categorical attributes, using a uniform distribution to
936
- sample a set of values from all values
937
- """
938
- def __init__(self, *values):
939
- """
940
- initializer
941
-
942
- Parameters
943
- values : list which contains a categorical values
944
- """
945
- self.values = values
946
- if (len(self.values) == 1):
947
- self.values = self.values[0]
948
- self.sampled = list()
949
-
950
- def sample(self):
951
- """
952
- samples value only from previously unsampled values
953
- """
954
- samp = selectRandomFromList(self.values)
955
- while True:
956
- if samp in self.sampled:
957
- samp = selectRandomFromList(self.values)
958
- else:
959
- self.sampled.append(samp)
960
- break
961
- return samp
962
-
963
- def setSampled(self, sampled):
964
- """
965
- set already sampled
966
-
967
- Parameters
968
- sampled : already sampled list
969
- """
970
- self.sampled = sampled
971
-
972
- def unsample(self, sample=None):
973
- """
974
- remove from sample history
975
-
976
- Parameters
977
- sample : sample to be removed
978
- """
979
- if sample is None:
980
- self.sampled.clear()
981
- else:
982
- self.sampled.remove(sample)
983
-
984
- class DistrMixtureSampler:
985
- """
986
- distr mixture sampler
987
- """
988
- def __init__(self, mixtureWtDistr, *compDistr):
989
- """
990
- initializer
991
-
992
- Parameters
993
- mixtureWtDistr : sampler that returns index into sampler list
994
- compDistr : sampler list
995
- """
996
- self.mixtureWtDistr = mixtureWtDistr
997
- self.compDistr = compDistr
998
- if (len(self.compDistr) == 1):
999
- self.compDistr = self.compDistr[0]
1000
-
1001
- def isNumeric(self):
1002
- return True
1003
-
1004
- def sample(self):
1005
- """
1006
- samples value
1007
- """
1008
- comp = self.mixtureWtDistr.sample()
1009
-
1010
- #sample sampled comp distr
1011
- return self.compDistr[comp].sample()
1012
-
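Usage sketch (illustrative): mixtureWtDistr must return an integer index into the component list, e.g. a DiscreteRejectSampler over 0..numComp-1.

from matumizi.sampler import DistrMixtureSampler, DiscreteRejectSampler, NormalSampler

# component 0 with relative weight 70, component 1 with weight 30
wt = DiscreteRejectSampler(0, 1, 1, 70, 30)
mix = DistrMixtureSampler(wt, NormalSampler(10, 2), NormalSampler(50, 5))
vals = [mix.sample() for _ in range(10)]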
1013
- class AncestralSampler:
1014
- """
1015
- ancestral sampler using conditional distribution
1016
- """
1017
- def __init__(self, parentDistr, childDistr, numChildren):
1018
- """
1019
- initializer
1020
-
1021
- Parameters
1022
- parentDistr : parent distr
1023
- childDistr : children distribution dictionary
1024
- numChildren : no of children
1025
- """
1026
- self.parentDistr = parentDistr
1027
- self.childDistr = childDistr
1028
- self.numChildren = numChildren
1029
-
1030
- def sample(self):
1031
- """
1032
- samples value
1033
- """
1034
- parent = self.parentDistr.sample()
1035
-
1036
- #sample all children conditioned on parent
1037
- children = []
1038
- for i in range(self.numChildren):
1039
- key = (parent, i)
1040
- child = self.childDistr[key].sample()
1041
- children.append(child)
1042
- return (parent, children)
1043
-
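Usage sketch (illustrative): childDistr is keyed by (parent value, child index), so each child can have its own conditional distribution.

from matumizi.sampler import AncestralSampler, CategoricalRejectSampler, NormalSampler

parent = CategoricalRejectSampler(("a", 60), ("b", 40))
child = {("a", 0): NormalSampler(10, 1), ("b", 0): NormalSampler(20, 2)}
smp = AncestralSampler(parent, child, 1)
pv, cvs = smp.sample()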
1044
- class ClusterSampler:
1045
- """
1046
- sample cluster and then sample member of sampled cluster
1047
- """
1048
- def __init__(self, clusters, *clustDistr):
1049
- """
1050
- initializer
1051
-
1052
- Parameters
1053
- clusters : dictionary of clusters
1054
- clustDistr : distr for clusters
1055
- """
1056
- self.sampler = CategoricalRejectSampler(*clustDistr)
1057
- self.clusters = clusters
1058
-
1059
- def sample(self):
1060
- """
1061
- samples value
1062
- """
1063
- cluster = self.sampler.sample()
1064
- member = random.choice(self.clusters[cluster])
1065
- return (cluster, member)
1066
-
1067
-
1068
- class MetropolitanSampler:
1069
- """
1070
- Metropolis sampler
1071
- """
1072
- def __init__(self, propStdDev, min, binWidth, values):
1073
- """
1074
- initializer
1075
-
1076
- Parameters
1077
- propStdDev : proposal distr std dev
1078
- min : min domain value for target distr
1079
- binWidth : bin width
1080
- values : target distr values
1081
- """
1082
- self.targetDistr = Histogram.createInitialized(min, binWidth, values)
1083
- self.propsalDistr = GaussianRejectSampler(0, propStdDev)
1084
- self.proposalMixture = False
1085
-
1086
- # bootstrap sample
1087
- (minv, maxv) = self.targetDistr.getMinMax()
1088
- self.curSample = random.randint(minv, maxv)
1089
- self.curDistr = self.targetDistr.value(self.curSample)
1090
- self.transCount = 0
1091
-
1092
- def initialize(self):
1093
- """
1094
- initialize
1095
- """
1096
- (minv, maxv) = self.targetDistr.getMinMax()
1097
- self.curSample = random.randint(minv, maxv)
1098
- self.curDistr = self.targetDistr.value(self.curSample)
1099
- self.transCount = 0
1100
-
1101
- def setProposalDistr(self, propsalDistr):
1102
- """
1103
- set custom proposal distribution
1104
-
1105
- Parameters
1106
- propsalDistr : proposal distribution
1107
- """
1108
- self.propsalDistr = propsalDistr
1109
-
1110
-
1111
- def setGlobalProposalDistr(self, globPropStdDev, proposalChoiceThreshold):
1112
- """
1113
- set global proposal distribution used as part of a proposal mixture
1114
-
1115
- Parameters
1116
- globPropStdDev : global proposal distr std deviation
1117
- proposalChoiceThreshold : threshold for using global proposal distribution
1118
- """
1119
- self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)
1120
- self.proposalChoiceThreshold = proposalChoiceThreshold
1121
- self.proposalMixture = True
1122
-
1123
- def sample(self):
1124
- """
1125
- samples value
1126
- """
1127
- nextSample = self.proposalSample(1)
1128
- self.targetSample(nextSample)
1129
- return self.curSample
1130
-
1131
- def proposalSample(self, skip):
1132
- """
1133
- sample from proposal distribution
1134
-
1135
- Parameters
1136
- skip : no of samples to skip
1137
- """
1138
- for i in range(skip):
1139
- if not self.proposalMixture:
1140
- #one proposal distr
1141
- nextSample = self.curSample + self.propsalDistr.sample()
1142
- nextSample = self.targetDistr.boundedValue(nextSample)
1143
- else:
1144
- #mixture of proposal distr
1145
- if random.random() < self.proposalChoiceThreshold:
1146
- nextSample = self.curSample + self.propsalDistr.sample()
1147
- else:
1148
- nextSample = self.curSample + self.globalProposalDistr.sample()
1149
- nextSample = self.targetDistr.boundedValue(nextSample)
1150
-
1151
- return nextSample
1152
-
1153
- def targetSample(self, nextSample):
1154
- """
1155
- target sample
1156
-
1157
- Parameters
1158
- nextSample : proposal distr sample
1159
- """
1160
- nextDistr = self.targetDistr.value(nextSample)
1161
-
1162
- transition = False
1163
- if nextDistr > self.curDistr:
1164
- transition = True
1165
- else:
1166
- distrRatio = float(nextDistr) / self.curDistr
1167
- if random.random() < distrRatio:
1168
- transition = True
1169
-
1170
- if transition:
1171
- self.curSample = nextSample
1172
- self.curDistr = nextDistr
1173
- self.transCount += 1
1174
-
1175
-
1176
- def subSample(self, skip):
1177
- """
1178
- sub sample
1179
-
1180
- Parameters
1181
- skip : no of samples to skip
1182
- """
1183
- nextSample = self.proposalSample(skip)
1184
- self.targetSample(nextSample)
1185
- return self.curSample
1186
-
1187
- def setMixtureProposal(self, globPropStdDev, mixtureThreshold):
1188
- """
1189
- mixture proposal
1190
-
1191
- Parameters
1192
- globPropStdDev : global proposal distr std deviation
1193
- mixtureThreshold : threshold for using global proposal distribution
1194
- """
1195
- self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)
1196
- self.mixtureThreshold = mixtureThreshold
1197
-
1198
- def sampleProposal(self):
1199
- """
1200
- sample from proposal distr
1201
-
1202
- """
1203
- if self.globalProposalDistr is None:
1204
- proposal = self.propsalDistr.sample()
1205
- else:
1206
- if random.random() < self.mixtureThreshold:
1207
- proposal = self.propsalDistr.sample()
1208
- else:
1209
- proposal = self.globalProposalDistr.sample()
1210
-
1211
- return proposal
1212
-
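The class above implements a Metropolis chain over a histogram target: a Gaussian step is proposed from the current sample and accepted outright when the target density rises, otherwise with probability equal to the density ratio. The bare accept rule, as a self-contained sketch (illustrative only, not the module's API):

import math
import random

def metropolisStep(x, target, propStdDev):
    # one Metropolis update for a 1-D unnormalized target density
    xn = x + random.gauss(0.0, propStdDev)
    ratio = target(xn) / target(x)
    return xn if random.random() < min(1.0, ratio) else x

# example: unnormalized standard normal target
target = lambda x: math.exp(-0.5 * x * x)
x = 0.0
samples = []
for _ in range(1000):
    x = metropolisStep(x, target, 1.0)
    samples.append(x)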
1213
- class PermutationSampler:
1214
- """
1215
- permutation sampler by shuffling a list
1216
- """
1217
- def __init__(self):
1218
- """
1219
- initialize
1220
- """
1221
- self.values = None
1222
- self.numShuffles = None
1223
-
1224
- @staticmethod
1225
- def createSamplerWithValues(values, *numShuffles):
1226
- """
1227
- creator with values
1228
-
1229
- Parameters
1230
- values : list data
1231
- numShuffles : no of shuffles or range of no of shuffles
1232
- """
1233
- sampler = PermutationSampler()
1234
- sampler.values = values
1235
- sampler.numShuffles = numShuffles
1236
- return sampler
1237
-
1238
- @staticmethod
1239
- def createSamplerWithRange(minv, maxv, *numShuffles):
1240
- """
1241
- creator with range min and max
1242
-
1243
- Parameters
1244
- minv : min of range
1245
- maxv : max of range
1246
- numShuffles : no of shuffles or range of no of shuffles
1247
- """
1248
- sampler = PermutationSampler()
1249
- sampler.values = list(range(minv, maxv + 1))
1250
- sampler.numShuffles = numShuffles
1251
- return sampler
1252
-
1253
- def sample(self):
1254
- """
1255
- sample new permutation
1256
- """
1257
- cloned = self.values.copy()
1258
- shuffle(cloned, *self.numShuffles)
1259
- return cloned
1260
-
1261
- class SpikeyDataSampler:
1262
- """
1263
- samples spikey data
1264
- """
1265
- def __init__(self, intvMean, intvScale, distr, spikeValueMean, spikeValueStd, spikeMaxDuration, baseValue = 0):
1266
- """
1267
- initializer
1268
-
1269
- Parameters
1270
- intvMean : interval mean
1271
- intvScale : interval std dev
1272
- distr : type of distr for interval
1273
- spikeValueMean : spike value mean
1274
- spikeValueStd : spike value std dev
1275
- spikeMaxDuration : max duration for spike
1276
- baseValue : base or offset value
1277
- """
1278
- if distr == "norm":
1279
- self.intvSampler = NormalSampler(intvMean, intvScale)
1280
- elif distr == "expo":
1281
- rate = 1.0 / intvScale
1282
- self.intvSampler = ExponentialSampler(rate)
1283
- else:
1284
- raise ValueError("invalid distribution")
1285
-
1286
- self.spikeSampler = NormalSampler(spikeValueMean, spikeValueStd)
1287
- self.spikeMaxDuration = spikeMaxDuration
1288
- self.baseValue = baseValue
1289
- self.inSpike = False
1290
- self.spikeCount = 0
1291
- self.baseCount = 0
1292
- self.baseLength = int(self.intvSampler.sample())
1293
- self.spikeValues = list()
1294
- self.spikeLength = None
1295
-
1296
- def sample(self):
1297
- """
1298
- sample new value
1299
- """
1300
- if self.baseCount <= self.baseLength:
1301
- sampled = self.baseValue
1302
- self.baseCount += 1
1303
- else:
1304
- if not self.inSpike:
1305
- #starting spike
1306
- spikeVal = self.spikeSampler.sample()
1307
- self.spikeLength = sampleUniform(1, self.spikeMaxDuration)
1308
- spikeMaxPos = 0 if self.spikeLength == 1 else sampleUniform(0, self.spikeLength-1)
1309
- self.spikeValues.clear()
1310
- for i in range(self.spikeLength):
1311
- if i < spikeMaxPos:
1312
- frac = (i + 1) / (spikeMaxPos + 1)
1313
- frac = sampleFloatFromBase(frac, 0.1 * frac)
1314
- elif i > spikeMaxPos:
1315
- frac = (self.spikeLength - i) / (self.spikeLength - spikeMaxPos)
1316
- frac = sampleFloatFromBase(frac, 0.1 * frac)
1317
- else:
1318
- frac = 1.0
1319
- self.spikeValues.append(frac * spikeVal)
1320
- self.inSpike = True
1321
- self.spikeCount = 0
1322
-
1323
-
1324
- sampled = self.spikeValues[self.spikeCount]
1325
- self.spikeCount += 1
1326
-
1327
- if self.spikeCount == self.spikeLength:
1328
- #ending spike
1329
- self.baseCount = 0
1330
- self.baseLength = int(self.intvSampler.sample())
1331
- self.inSpike = False
1332
-
1333
- return sampled
1334
-
1335
-
1336
- class EventSampler:
1337
- """
1338
- sample event
1339
- """
1340
- def __init__(self, intvSampler, valSampler=None):
1341
- """
1342
- initializer
1343
-
1344
- Parameters
1345
- intvSampler : interval sampler
1346
- valSampler : value sampler
1347
- """
1348
- self.intvSampler = intvSampler
1349
- self.valSampler = valSampler
1350
- self.trigger = int(self.intvSampler.sample())
1351
- self.count = 0
1352
-
1353
- def reset(self):
1354
- """
1355
- reset trigger
1356
- """
1357
- self.trigger = int(self.intvSampler.sample())
1358
- self.count = 0
1359
-
1360
- def sample(self):
1361
- """
1362
- sample event
1363
- """
1364
- if self.count == self.trigger:
1365
- sampled = self.valSampler.sample() if self.valSampler is not None else 1.0
1366
- self.trigger = int(self.intvSampler.sample())
1367
- self.count = 0
1368
- else:
1369
- sampled = 0.0
1370
- self.count += 1
1371
- return sampled
1372
-
1373
-
1374
-
1375
-
1376
- def createSampler(data):
1377
- """
1378
- create sampler
1379
-
1380
- Parameters
1381
- data : sampler description
1382
- """
1383
- #print(data)
1384
- items = data.split(":")
1385
- size = len(items)
1386
- dtype = items[-1]
1387
- stype = items[-2]
1388
- #print("sampler data {}".format(data))
1389
- #print("sampler {}".format(stype))
1390
- sampler = None
1391
- if stype == "uniform":
1392
- if dtype == "int":
1393
- min = int(items[0])
1394
- max = int(items[1])
1395
- sampler = UniformNumericSampler(min, max)
1396
- elif dtype == "float":
1397
- min = float(items[0])
1398
- max = float(items[1])
1399
- sampler = UniformNumericSampler(min, max)
1400
- elif dtype == "categorical":
1401
- values = items[:-2]
1402
- sampler = UniformCategoricalSampler(values)
1403
- elif stype == "normal":
1404
- mean = float(items[0])
1405
- sd = float(items[1])
1406
- sampler = NormalSampler(mean, sd)
1407
- if dtype == "int":
1408
- sampler.sampleAsIntValue()
1409
- elif stype == "nonparam":
1410
- if dtype == "int" or dtype == "float":
1411
- min = int(items[0])
1412
- binWidth = int(items[1])
1413
- values = items[2:-2]
1414
- values = list(map(lambda v: int(v), values))
1415
- sampler = NonParamRejectSampler(min, binWidth, values)
1416
- if dtype == "float":
1417
- sampler.sampleAsFloat()
1418
- elif dtype == "categorical":
1419
- values = list()
1420
- for i in range(0, size-2, 2):
1421
- cval = items[i]
1422
- dist = int(items[i+1])
1423
- pair = (cval, dist)
1424
- values.append(pair)
1425
- sampler = CategoricalRejectSampler(values)
1426
- elif dtype == "scategorical":
1427
- vfpath = items[0]
1428
- values = getFileLines(vfpath, None)
1429
- sampler = CategoricalSetSampler(values)
1430
- elif stype == "discrete":
1431
- vmin = int(items[0])
1432
- vmax = int(items[1])
1433
- step = int(items[2])
1434
- values = list(map(lambda i : int(items[i]), range(3, len(items)-2)))
1435
- sampler = DiscreteRejectSampler(vmin, vmax, step, values)
1436
- elif stype == "bernauli":
1437
- pr = float(items[0])
1438
- events = None
1439
- if len(items) == 5:
1440
- events = list()
1441
- if dtype == "int":
1442
- events.append(int(items[1]))
1443
- events.append(int(items[2]))
1444
- elif dtype == "categorical":
1445
- events.append(items[1])
1446
- events.append(items[2])
1447
- sampler = BernoulliTrialSampler(pr, events)
1448
- else:
1449
- raise ValueError("invalid sampler type " + stype)
1450
- return sampler
1451
-
1452
-
1453
-
1454
-
1455
-
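createSampler parses a colon separated description whose last two fields are the sampler type and the data type; everything before them is sampler specific. A few illustrative descriptions (assuming createSampler is importable from matumizi.sampler):

from matumizi.sampler import createSampler

ns = createSampler("100:15:normal:float")    # normal, mean 100, sd 15
us = createSampler("1:6:uniform:int")        # uniform int in [1, 6]
cs = createSampler("low:60:med:30:high:10:nonparam:categorical")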
matumizi/matumizi/stats.py DELETED
@@ -1,496 +0,0 @@
1
- #!/usr/local/bin/python3
2
-
3
- # avenir-python: Machine Learning
4
- # Author: Pranab Ghosh
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License"); you
7
- # may not use this file except in compliance with the License. You may
8
- # obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
- # implied. See the License for the specific language governing
16
- # permissions and limitations under the License.
17
-
18
- import sys
19
- import random
20
- import time
21
- import math
22
- import numpy as np
23
- import statistics
24
- from .util import *
25
-
26
- """
27
- histogram class
28
- """
29
- class Histogram:
30
- def __init__(self, min, binWidth):
31
- """
32
- initializer
33
-
34
- Parameters
35
- min : min x
36
- binWidth : bin width
37
- """
38
- self.xmin = min
39
- self.binWidth = binWidth
40
- self.normalized = False
41
-
42
- @classmethod
43
- def createInitialized(cls, xmin, binWidth, values):
44
- """
45
- create histogram instance with min domain, bin width and values
46
-
47
- Parameters
48
- min : min x
49
- binWidth : bin width
50
- values : y values
51
- """
52
- instance = cls(xmin, binWidth)
53
- instance.xmax = xmin + binWidth * (len(values) - 1)
54
- instance.ymin = 0
55
- instance.bins = np.array(values)
56
- instance.fmax = 0
57
- for v in values:
58
- if (v > instance.fmax):
59
- instance.fmax = v
60
- instance.ymin = 0.0
61
- instance.ymax = instance.fmax
62
- return instance
63
-
64
- @classmethod
65
- def createWithNumBins(cls, values, numBins=20):
66
- """
67
- create histogram instance values and no of bins
68
-
69
- Parameters
70
- values : y values
71
- numBins : no of bins
72
- """
73
- xmin = min(values)
74
- xmax = max(values)
75
- binWidth = (xmax + .01 - (xmin - .01)) / numBins
76
- instance = cls(xmin, binWidth)
77
- instance.xmax = xmax
78
- instance.numBin = numBins
79
- instance.bins = np.zeros(instance.numBin)
80
- for v in values:
81
- instance.add(v)
82
- return instance
83
-
84
- @classmethod
85
- def createUninitialized(cls, xmin, xmax, binWidth):
86
- """
87
- create histogram instance with no y values using domain min , max and bin width
88
-
89
- Parameters
90
- min : min x
91
- max : max x
92
- binWidth : bin width
93
- """
94
- instance = cls(xmin, binWidth)
95
- instance.xmax = xmax
96
- instance.numBin = int((xmax - xmin) / binWidth) + 1
97
- instance.bins = np.zeros(instance.numBin)
98
- return instance
99
-
100
- def initialize(self):
101
- """
102
- set y values to 0
103
- """
104
- self.bins = np.zeros(self.numBin)
105
-
106
- def add(self, value):
107
- """
108
- adds a value to a bin
109
-
110
- Parameters
111
- value : value
112
- """
113
- bin = int((value - self.xmin) / self.binWidth)
114
- if (bin < 0 or bin > self.numBin - 1):
115
- print (bin)
116
- raise ValueError("outside histogram range")
117
- self.bins[bin] += 1.0
118
-
119
- def normalize(self):
120
- """
121
- normalize bin counts
122
- """
123
- if not self.normalized:
124
- total = self.bins.sum()
125
- self.bins = np.divide(self.bins, total)
126
- self.normalized = True
127
-
128
- def cumDistr(self):
129
- """
130
- cumulative dists
131
- """
132
- self.normalize()
133
- self.cbins = np.cumsum(self.bins)
134
- return self.cbins
135
-
136
- def distr(self):
137
- """
138
- distr
139
- """
140
- self.normalize()
141
- return self.bins
142
-
143
-
144
- def percentile(self, percent):
145
- """
146
- return value corresponding to a percentile
147
-
148
- Parameters
149
- percent : percentile value
150
- """
151
- if self.cbins is None:
152
- raise ValueError("cumulative distribution is not available")
153
-
154
- for i,cuml in enumerate(self.cbins):
155
- if percent <= cuml:
156
- pcuml = self.cbins[i-1] if i > 0 else 0.0
157
- value = self.xmin + (i * self.binWidth) - (self.binWidth / 2) + (percent - pcuml) * self.binWidth / (cuml - pcuml)
158
- break
159
- return value
160
-
161
- def max(self):
162
- """
163
- return max bin value
164
- """
165
- return self.bins.max()
166
-
167
- def value(self, x):
168
- """
169
- return a bin value
170
-
171
- Parameters
172
- x : x value
173
- """
174
- bin = int((x - self.xmin) / self.binWidth)
175
- f = self.bins[bin]
176
- return f
177
-
178
- def bin(self, x):
179
- """
180
- return a bin index
181
-
182
- Parameters
183
- x : x value
184
- """
185
- return int((x - self.xmin) / self.binWidth)
186
-
187
- def cumValue(self, x):
188
- """
189
- return a cumulative bin value
190
-
191
- Parameters
192
- x : x value
193
- """
194
- bin = int((x - self.xmin) / self.binWidth)
195
- c = self.cbins[bin]
196
- return c
197
-
198
-
199
- def getMinMax(self):
200
- """
201
- returns x min and x max
202
- """
203
- return (self.xmin, self.xmax)
204
-
205
- def boundedValue(self, x):
206
- """
207
- return x bounded by min and max
208
-
209
- Parameters
210
- x : x value
211
- """
212
- if x < self.xmin:
213
- x = self.xmin
214
- elif x > self.xmax:
215
- x = self.xmax
216
- return x
217
-
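A minimal usage sketch for the histogram (illustrative; assumes the class is importable from matumizi.stats):

import random
from matumizi.stats import Histogram

data = [random.gauss(50, 10) for _ in range(1000)]
hist = Histogram.createWithNumBins(data, 20)
distr = hist.distr()        # normalized bin frequencies
cdistr = hist.cumDistr()    # cumulative distribution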
218
- """
219
- categorical histogram class
220
- """
221
- class CatHistogram:
222
- def __init__(self):
223
- """
224
- initializer
225
- """
226
- self.binCounts = dict()
227
- self.counts = 0
228
- self.normalized = False
229
-
230
- def add(self, value):
231
- """
232
- adds a value to a bin
233
-
234
- Parameters
235
- value : value to add
236
- """
237
- addToKeyedCounter(self.binCounts, value)
238
- self.counts += 1
239
-
240
- def normalize(self):
241
- """
242
- normalize
243
- """
244
- if not self.normalized:
245
- self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))
246
- self.normalized = True
247
-
248
- def getMode(self):
249
- """
250
- get mode
251
- """
252
- maxk = None
253
- maxv = 0
254
- #print(self.binCounts)
255
- for k,v in self.binCounts.items():
256
- if v > maxv:
257
- maxk = k
258
- maxv = v
259
- return (maxk, maxv)
260
-
261
- def getEntropy(self):
262
- """
263
- get entropy
264
- """
265
- self.normalize()
266
- entr = 0
267
- #print(self.binCounts)
268
- for k,v in self.binCounts.items():
269
- entr -= v * math.log(v)
270
- return entr
271
-
272
- def getUniqueValues(self):
273
- """
274
- get unique values
275
- """
276
- return list(self.binCounts.keys())
277
-
278
- def getDistr(self):
279
- """
280
- get distribution
281
- """
282
- self.normalize()
283
- return self.binCounts.copy()
284
-
285
- class RunningStat:
286
- """
287
- running stat class
288
- """
289
- def __init__(self):
290
- """
291
- initializer
292
- """
293
- self.sum = 0.0
294
- self.sumSq = 0.0
295
- self.count = 0
296
-
297
- @staticmethod
298
- def create(count, sum, sumSq):
299
- """
300
- creates instance
301
-
302
- Parameters
303
- count : count of values
- sum : sum of values
304
- sumSq : sum of values squared
305
- """
306
- rs = RunningStat()
307
- rs.sum = sum
308
- rs.sumSq = sumSq
309
- rs.count = count
310
- return rs
311
-
312
- def add(self, value):
313
- """
314
- adds new value
315
-
316
- Parameters
317
- value : value to add
318
- """
319
- self.sum += value
320
- self.sumSq += (value * value)
321
- self.count += 1
322
-
323
- def getStat(self):
324
- """
325
- return mean and std deviation
326
- """
327
- mean = self.sum / self.count
328
- t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)
329
- sd = math.sqrt(t)
330
- re = (mean, sd)
331
- return re
332
-
333
- def addGetStat(self,value):
334
- """
335
- calculate mean and std deviation with new value added
336
-
337
- Parameters
338
- value : value to add
339
- """
340
- self.add(value)
341
- re = self.getStat()
342
- return re
343
-
344
- def getCount(self):
345
- """
346
- return count
347
- """
348
- return self.count
349
-
350
- def getState(self):
351
- """
352
- return state
353
- """
354
- s = (self.count, self.sum, self.sumSq)
355
- return s
356
-
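RunningStat keeps only count, sum and sum of squares, from which mean = sum / n and the unbiased variance sumSq / (n - 1) - mean^2 * n / (n - 1) are recovered. Usage sketch (illustrative; assumes the class is importable from matumizi.stats):

from matumizi.stats import RunningStat

rs = RunningStat()
for v in [12.0, 15.0, 11.0, 14.0]:
    rs.add(v)
mean, sd = rs.getStat()

# the state triple can be persisted and restored without the raw values
count, total, totalSq = rs.getState()
rs2 = RunningStat.create(count, total, totalSq)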
357
- class SlidingWindowStat:
358
- """
359
- sliding window stats
360
- """
361
- def __init__(self):
362
- """
363
- initializer
364
- """
365
- self.sum = 0.0
366
- self.sumSq = 0.0
367
- self.count = 0
368
- self.values = None
369
-
370
- @staticmethod
371
- def create(values, sum, sumSq):
372
- """
373
- creates instance
374
-
375
- Parameters
376
- sum : sum of values
377
- sumSq : sum of values squared
378
- """
379
- sws = SlidingWindowStat()
380
- sws.sum = sum
381
- sws.sumSq = sumSq
382
- sws.values = values.copy()
383
- sws.count = len(sws.values)
384
- return sws
385
-
386
- @staticmethod
387
- def initialize(values):
388
- """
389
- creates instance from a list of values
390
-
391
- Parameters
392
- values : list of values
393
- """
394
- sws = SlidingWindowStat()
395
- sws.values = values.copy()
396
- for v in sws.values:
397
- sws.sum += v
398
- sws.sumSq += v * v
399
- sws.count = len(sws.values)
400
- return sws
401
-
402
- @staticmethod
403
- def createEmpty(count):
404
- """
405
- creates empty instance with given window size
406
-
407
- Parameters
408
- count : count of values
409
- """
410
- sws = SlidingWindowStat()
411
- sws.count = count
412
- sws.values = list()
413
- return sws
414
-
415
- def add(self, value):
416
- """
417
- adds new value
418
-
419
- Parameters
420
- value : value to add
421
- """
422
- self.values.append(value)
423
- if len(self.values) > self.count:
424
- self.sum += value - self.values[0]
425
- self.sumSq += (value * value) - (self.values[0] * self.values[0])
426
- self.values.pop(0)
427
- else:
428
- self.sum += value
429
- self.sumSq += (value * value)
430
-
431
-
432
- def getStat(self):
433
- """
434
- calculate mean and std deviation
435
- """
436
- mean = self.sum / self.count
437
- t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)
438
- sd = math.sqrt(t)
439
- re = (mean, sd)
440
- return re
441
-
442
- def addGetStat(self,value):
443
- """
444
- calculate mean and std deviation with new value added
445
- """
446
- self.add(value)
447
- re = self.getStat()
448
- return re
449
-
450
- def getCount(self):
451
- """
452
- return count
453
- """
454
- return self.count
455
-
456
- def getCurSize(self):
457
- """
458
- return current number of values in the window
459
- """
460
- return len(self.values)
461
-
462
- def getState(self):
463
- """
464
- return state
465
- """
466
- s = (self.count, self.sum, self.sumSq)
467
- return s
468
-
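SlidingWindowStat keeps the same sums over a fixed-size window, subtracting the oldest value as each new one arrives. Usage sketch (illustrative):

from matumizi.stats import SlidingWindowStat

sws = SlidingWindowStat.initialize([10.0, 12.0, 11.0])
mean, sd = sws.addGetStat(20.0)    # 10.0 drops out of the 3 value window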
469
-
470
- def basicStat(ldata):
471
- """
472
- mean and std dev
473
-
474
- Parameters
475
- ldata : list of values
476
- """
477
- m = statistics.mean(ldata)
478
- s = statistics.stdev(ldata, xbar=m)
479
- r = (m, s)
480
- return r
481
-
482
- def getFileColumnStat(filePath, col, delem=","):
483
- """
484
- gets stats for a file column
485
-
486
- Parameters
487
- filePath : file path
488
- col : col index
489
- delem : field delimiter
490
- """
491
- rs = RunningStat()
492
- for rec in fileRecGen(filePath, delem):
493
- va = float(rec[col])
494
- rs.add(va)
495
-
496
- return rs.getStat()
matumizi/matumizi/util.py DELETED
@@ -1,2345 +0,0 @@
1
- #!/usr/local/bin/python3
2
-
3
- # Author: Pranab Ghosh
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License"); you
6
- # may not use this file except in compliance with the License. You may
7
- # obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
- # implied. See the License for the specific language governing
15
- # permissions and limitations under the License.
16
-
17
- import os
18
- import sys
19
- from random import randint
20
- import random
21
- import time
22
- import uuid
23
- from datetime import datetime
24
- import math
25
- import numpy as np
26
- import pandas as pd
27
- import matplotlib.pyplot as plt
28
- import numpy as np
29
- import logging
30
- import logging.handlers
31
- import pickle
32
- from contextlib import contextmanager
33
-
34
- tokens = ["0","1","2","3","4","5","6","7","8","9","A","B","C","D","E","F","G","H","I","J","K","L","M",
35
- "N","O","P","Q","R","S","T","U","V","W","X","Y","Z","0","1","2","3","4","5","6","7","8","9"]
36
- numTokens = tokens[:10]
37
- alphaTokens = tokens[10:36]
38
- loCaseChars = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k","l","m","n","o",
39
- "p","q","r","s","t","u","v","w","x","y","z"]
40
-
41
- typeInt = "int"
42
- typeFloat = "float"
43
- typeString = "string"
44
-
45
- secInMinute = 60
46
- secInHour = 60 * 60
47
- secInDay = 24 * secInHour
48
- secInWeek = 7 * secInDay
49
- secInYear = 365 * secInDay
50
- secInMonth = secInYear / 12
51
-
52
- minInHour = 60
53
- minInDay = 24 * minInHour
54
-
55
- ftPerYard = 3
56
- ftPerMile = ftPerYard * 1760
57
-
58
-
59
- def genID(size):
60
- """
61
- generates ID
62
-
63
- Parameters
64
- size : size of ID
65
- """
66
- id = ""
67
- for i in range(size):
68
- id = id + selectRandomFromList(tokens)
69
- return id
70
-
71
- def genIdList(numId, idSize):
72
- """
73
- generate list of IDs
74
-
75
- Parameters:
76
- numId: number of Ids
77
- idSize: ID size
78
- """
79
- iDs = []
80
- for i in range(numId):
81
- iDs.append(genID(idSize))
82
- return iDs
83
-
84
- def genNumID(size):
85
- """
86
- generates ID consisting of digits only
87
-
88
- Parameters
89
- size : size of ID
90
- """
91
- id = ""
92
- for i in range(size):
93
- id = id + selectRandomFromList(numTokens)
94
- return id
95
-
96
- def genLowCaseID(size):
97
- """
98
- generates ID consisting of lower case chars
99
-
100
- Parameters
101
- size : size of ID
102
- """
103
- id = ""
104
- for i in range(size):
105
- id = id + selectRandomFromList(loCaseChars)
106
- return id
107
-
108
- def genNumIdList(numId, idSize):
109
- """
110
- generate list of numeric IDs
111
-
112
- Parameters:
113
- numId: number of Ids
114
- idSize: ID size
115
- """
116
- iDs = []
117
- for i in range(numId):
118
- iDs.append(genNumID(idSize))
119
- return iDs
120
-
121
- def genNameInitial():
122
- """
123
- generate name initial
124
- """
125
- return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)
126
-
127
- def genPhoneNum(arCode):
128
- """
129
- generates phone number
130
-
131
- Parameters
132
- arCode: area code
133
- """
134
- phNum = genNumID(7)
135
- return arCode + str(phNum)
136
-
137
- def selectRandomFromList(ldata):
138
- """
139
- select an element randomly from a list
140
-
141
- Parameters
142
- ldata : list data
143
- """
144
- return ldata[randint(0, len(ldata)-1)]
145
-
146
- def selectOtherRandomFromList(ldata, cval):
147
- """
148
- select an element randomly from a list excluding the given one
149
-
150
- Parameters
151
- ldata : list data
152
- cval : value to be excluded
153
- """
154
- nval = selectRandomFromList(ldata)
155
- while nval == cval:
156
- nval = selectRandomFromList(ldata)
157
- return nval
158
-
159
- def selectRandomSubListFromList(ldata, num):
160
- """
161
- generates random sublist from a list without replacement
162
-
163
- Parameters
164
- ldata : list data
165
- num : output list size
166
- """
167
- assertLesser(num, len(ldata), "size of sublist to be sampled greater than or equal to main list")
168
- i = randint(0, len(ldata)-1)
169
- sel = ldata[i]
170
- selSet = {i}
171
- selList = [sel]
172
- while (len(selSet) < num):
173
- i = randint(0, len(ldata)-1)
174
- if (i not in selSet):
175
- sel = ldata[i]
176
- selSet.add(i)
177
- selList.append(sel)
178
- return selList
179
-
180
- def selectRandomSubListFromListWithRepl(ldata, num):
181
- """
182
- generates random sublist from a list with replacement
183
-
184
- Parameters
185
- ldata : list data
186
- num : output list size
187
-
188
- """
189
- return list(map(lambda i : selectRandomFromList(ldata), range(num)))
190
-
191
- def selectRandomFromDict(ddata):
192
- """
193
- select an element randomly from a dictionary
194
-
195
- Parameters
196
- ddata : dictionary data
197
- """
198
- dkeys = list(ddata.keys())
199
- dk = selectRandomFromList(dkeys)
200
- el = (dk, ddata[dk])
201
- return el
202
-
203
- def setListRandomFromList(ldata, ldataRepl):
204
- """
205
- sets some elements in the first list randomly with elements from the second list
206
-
207
- Parameters
208
- ldata : list data
209
- ldataRepl : list with replacement data
210
- """
211
- l = len(ldata)
212
- selSet = set()
213
- for d in ldataRepl:
214
- i = randint(0, l-1)
215
- while i in selSet:
216
- i = randint(0, l-1)
217
- ldata[i] = d
218
- selSet.add(i)
219
-
220
- def genIpAddress():
221
- """
222
- generates IP address
223
- """
224
- i1 = randint(0,255)
225
- i2 = randint(0,255)
226
- i3 = randint(0,255)
227
- i4 = randint(0,255)
228
- ip = "%d.%d.%d.%d" %(i1,i2,i3,i4)
229
- return ip
230
-
231
- def curTimeMs():
232
- """
233
- current time in ms
234
- """
235
- return int((datetime.utcnow() - datetime(1970,1,1)).total_seconds() * 1000)
236
-
237
- def secDegPolyFit(x1, y1, x2, y2, x3, y3):
238
- """
239
- second deg polynomial
240
-
241
- Parameters
242
- x1 : 1st point x
243
- y1 : 1st point y
244
- x2 : 2nd point x
245
- y2 : 2nd point y
246
- x3 : 3rd point x
247
- y3 : 3rd point y
248
- """
249
- t = (y1 - y2) / (x1 - x2)
250
- a = t - (y2 - y3) / (x2 - x3)
251
- a = a / (x1 - x3)
252
- b = t - a * (x1 + x2)
253
- c = y1 - a * x1 * x1 - b * x1
254
- return (a, b, c)
255
-
256
- def range_limit(val, minv, maxv):
257
- """
258
- range limit a value
259
-
260
- Parameters
261
- val : data value
262
- minv : minimum
263
- maxv : maximum
264
- """
265
- if (val < minv):
266
- val = minv
267
- elif (val > maxv):
268
- val = maxv
269
- return val
270
-
271
- def rangeLimit(val, minv, maxv):
272
- """
273
- range limit a value
274
-
275
- Parameters
276
- val : data value
277
- minv : minimum
278
- maxv : maximum
279
- """
280
- return range_limit(val, minv, maxv)
281
-
282
- def isInRange(val, minv, maxv):
283
- """
284
- checks if within range
285
-
286
- Parameters
287
- val : data value
288
- minv : minimum
289
- maxv : maximum
290
- """
291
- return val >= minv and val <= maxv
292
-
293
- def stripFileLines(filePath, offset):
294
- """
295
- strips number of chars from both ends
296
-
297
- Parameters
298
- filePath : file path
299
- offset : offset from both ends of line
300
- """
301
- fp = open(filePath, "r")
302
- for line in fp:
303
- stripped = line[offset:len(line) - 1 - offset]
304
- print (stripped)
305
- fp.close()
306
-
307
- def genLatLong(lat1, long1, lat2, long2):
308
- """
309
- generate lat long within limits
310
-
311
- Parameters
312
- lat1 : lat of 1st point
313
- long1 : long of 1st point
314
- lat2 : lat of 2nd point
315
- long2 : long of 2nd point
316
- """
317
- lat = lat1 + (lat2 - lat1) * random.random()
318
- longg = long1 + (long2 - long1) * random.random()
319
- return (lat, longg)
320
-
321
- def geoDistance(lat1, long1, lat2, long2):
322
- """
323
- find geo distance in ft
324
-
325
- Parameters
326
- lat1 : lat of 1st point
327
- long1 : long of 1st point
328
- lat2 : lat of 2nd point
329
- long2 : long of 2nd point
330
- """
331
- latDiff = math.radians(lat1 - lat2)
332
- longDiff = math.radians(long1 - long2)
333
- l1 = math.sin(latDiff/2.0)
334
- l2 = math.sin(longDiff/2.0)
335
- l3 = math.cos(math.radians(lat1))
336
- l4 = math.cos(math.radians(lat2))
337
- a = l1 * l1 + l3 * l4 * l2 * l2
338
- l5 = math.sqrt(a)
339
- l6 = math.sqrt(1.0 - a)
340
- c = 2.0 * math.atan2(l5, l6)
341
- r = 6371008.8 * 3.280840
342
- return c * r
343
-
344
- def minLimit(val, limit):
345
- """
346
- min limit
347
- Parameters
348
- val : value
- limit : lower limit
349
- """
350
- if (val < limit):
351
- val = limit
352
- return val
353
-
354
- def maxLimit(val, limit):
355
- """
356
- max limit
357
- Parameters
358
- val : value
- limit : upper limit
359
- """
360
- if (val > limit):
361
- val = limit
362
- return val
363
-
364
- def rangeSample(val, minLim, maxLim):
365
- """
366
- if outside range, sample within range
367
-
368
- Parameters
369
- val : value
370
- minLim : minimum
371
- maxLim : maximum
372
- """
373
- if val < minLim or val > maxLim:
374
- val = randint(minLim, maxLim)
375
- return val
376
-
377
- def genRandomIntListWithinRange(size, minLim, maxLim):
378
- """
379
- random unique list of integers within range
380
-
381
- Parameters
382
- size : size of returned list
383
- minLim : minimum
384
- maxLim : maximum
385
- """
386
- values = set()
387
- while len(values) < size:
388
- val = randint(minLim, maxLim)
389
- if val not in values:
390
- values.add(val)
391
- return list(values)
392
-
393
- def preturbScalar(value, vrange, distr="uniform"):
394
- """
395
- perturbs a value multiplicatively within a range
396
-
397
- Parameters
398
- value : data value
399
- vrange : value delta fraction
400
- distr : noise distribution type
401
- """
402
- if distr == "uniform":
403
- scale = 1.0 - vrange + 2 * vrange * random.random()
404
- elif distr == "normal":
405
- scale = 1.0 + np.random.normal(0, vrange)
406
- else:
407
- exitWithMsg("unknown noise distr " + distr)
408
- return value * scale
409
-
410
- def preturbScalarAbs(value, vrange):
411
- """
412
- perturbs a value additively within an absolute range
413
-
414
- Parameters
415
- value : data value
416
- vrange : value delta absolute
417
-
418
- """
419
- delta = - vrange + 2.0 * vrange * random.random()
420
- return value + delta
421
-
422
- def preturbVector(values, vrange):
423
- """
424
- perturbs each element of a list within a range
425
-
426
- Parameters
427
- values : list data
428
- vrange : value delta fraction
429
- """
430
- nValues = list(map(lambda va: preturbScalar(va, vrange), values))
431
- return nValues
432
-
433
- def randomShiftVector(values, smin, smax):
434
- """
435
- shifts a list by a random quantity within a range
436
-
437
- Parameters
438
- values : list data
439
- smin : sampling minimum
440
- smax : sampling maximum
441
- """
442
- shift = np.random.uniform(smin, smax)
443
- return list(map(lambda va: va + shift, values))
444
-
445
- def floatRange(beg, end, incr):
446
- """
447
- generates float range
448
-
449
- Parameters
450
- beg : range begin
451
- end : range end
452
- incr : range increment
453
- """
454
- return list(np.arange(beg, end, incr))
455
-
456
- def shuffle(values, *numShuffles):
457
- """
458
- in place shuffling with swap of pairs
459
-
460
- Parameters
461
- values : list data
462
- numShuffles : parameter list for number of shuffles
463
- """
464
- size = len(values)
465
- if len(numShuffles) == 0:
466
- numShuffle = int(size / 2)
467
- elif len(numShuffles) == 1:
468
- numShuffle = numShuffles[0]
469
- else:
470
- numShuffle = randint(numShuffles[0], numShuffles[1])
471
- print("numShuffle {}".format(numShuffle))
472
- for i in range(numShuffle):
473
- first = random.randint(0, size - 1)
474
- second = random.randint(0, size - 1)
475
- while first == second:
476
- second = random.randint(0, size - 1)
477
- tmp = values[first]
478
- values[first] = values[second]
479
- values[second] = tmp
480
-
481
-
482
- def splitList(itms, numGr):
483
- """
484
- splits a list into sublists of approximately equal size, with items in sublists randomly chosen
485
-
486
- Parameters
487
- itms : list of values
488
- numGr : no of groups
489
- """
490
- tcount = len(itms)
491
- cItems = list(itms)
492
- sz = int(len(cItems) / numGr)
493
- groups = list()
494
- count = 0
495
- for i in range(numGr):
496
- if (i == numGr - 1):
497
- csz = tcount - count
498
- else:
499
- csz = sz + randint(-2, 2)
500
- count += csz
501
- gr = list()
502
- for j in range(csz):
503
- it = selectRandomFromList(cItems)
504
- gr.append(it)
505
- cItems.remove(it)
506
- groups.append(gr)
507
- return groups
508
-
509
- def multVector(values, vrange):
510
- """
511
- multiplies a list within value range
512
-
513
- Parameters
514
- values : list of values
515
- vrange : fraction of value used for scaling
516
- """
517
- scale = 1.0 - vrange + 2 * vrange * random.random()
518
- nValues = list(map(lambda va: va * scale, values))
519
- return nValues
520
-
521
- def weightedAverage(values, weights):
522
- """
523
- calculates weighted average
524
-
525
- Parameters
526
- values : list of values
527
- weights : list of weights
528
- """
529
- assert len(values) == len(weights), "values and weights should be same size"
530
- vw = zip(values, weights)
531
- wva = list(map(lambda e : e[0] * e[1], vw))
532
- #wa = sum(x * y for x, y in vw) / sum(weights)
533
- wav = sum(wva) / sum(weights)
534
- return wav
535
-
536
- def extractFields(line, delim, keepIndices):
537
- """
538
- breaks a line into fields, keeps only the specified fields and returns a new line
539
-
540
- Parameters
541
- line : delim separated string
542
- delim : delimiter
543
- keepIndices : list of indexes to fields to be retained
544
- """
545
- items = line.split(delim)
546
- newLine = []
547
- for i in keepIndices:
548
- newLine.append(items[i])
549
- return delim.join(newLine)
550
-
551
- def remFields(line, delim, remIndices):
552
- """
553
- removes fields from delim separated string
554
-
555
- Parameters
556
- line : delim separated string
557
- delim : delimiter
558
- remIndices : list of indexes to fields to be removed
559
- """
560
- items = line.split(delim)
561
- newLine = []
562
- for i in range(len(items)):
563
- if not arrayContains(remIndices, i):
564
- newLine.append(items[i])
565
- return delim.join(newLine)
566
-
567
- def extractList(data, indices):
568
- """
569
- extracts list from another list, given indices
570
-
571
- Parameters
572
- data : list data
573
- indices : list of indexes to fields to be retained
574
- """
575
- if areAllFieldsIncluded(data, indices):
576
- exList = data.copy()
577
- #print("all indices")
578
- else:
579
- exList = list()
580
- le = len(data)
581
- for i in indices:
582
- assert i < le , "index {} out of bound {}".format(i, le)
583
- exList.append(data[i])
584
-
585
- return exList
586
-
587
- def arrayContains(arr, item):
588
- """
589
- checks if array contains an item
590
-
591
- Parameters
592
- arr : list data
593
- item : item to search
594
- """
595
- contains = True
596
- try:
597
- arr.index(item)
598
- except ValueError:
599
- contains = False
600
- return contains
601
-
602
- def strToIntArray(line, delim=","):
603
- """
604
- int array from delim separated string
605
-
606
- Parameters
607
- line : delim separated string
608
- """
609
- arr = line.split(delim)
610
- return [int(a) for a in arr]
611
-
612
- def strToFloatArray(line, delim=","):
613
- """
614
- float array from delim separated string
615
-
616
- Parameters
617
- line : delim separated string
618
- """
619
- arr = line.split(delim)
620
- return [float(a) for a in arr]
621
-
622
- def strListOrRangeToIntArray(line):
623
- """
624
- int array from delim separated string or range
625
-
626
- Parameters
627
- line : delim separated string
628
- """
629
- varr = line.split(",")
630
- if (len(varr) > 1):
631
- iarr = list(map(lambda v: int(v), varr))
632
- else:
633
- vrange = line.split(":")
634
- if (len(vrange) == 2):
635
- lo = int(vrange[0])
636
- hi = int(vrange[1])
637
- iarr = list(range(lo, hi+1))
638
- else:
639
- iarr = [int(line)]
640
- return iarr
641
-
642
- def toStr(val, precision):
643
- """
644
- converts any type to string
645
-
646
- Parameters
647
- val : value
648
- precision : precision for float value
649
- """
650
- if type(val) == float or type(val) == np.float64 or type(val) == np.float32:
651
- format = "%" + ".%df" %(precision)
652
- sVal = format %(val)
653
- else:
654
- sVal = str(val)
655
- return sVal
656
-
657
- def toStrFromList(values, precision, delim=","):
658
- """
659
- converts list of any type to delim separated string
660
-
661
- Parameters
662
- values : list data
663
- precision : precision for float value
664
- delim : delimiter
665
- """
666
- sValues = list(map(lambda v: toStr(v, precision), values))
667
- return delim.join(sValues)
668
-
669
- def toIntList(values):
670
- """
671
- convert to int list
672
-
673
- Parameters
674
- values : list data
675
- """
676
- return list(map(lambda va: int(va), values))
677
-
678
- def toFloatList(values):
679
- """
680
- convert to float list
681
-
682
- Parameters
683
- values : list data
684
-
685
- """
686
- return list(map(lambda va: float(va), values))
687
-
688
- def toStrList(values, precision=None):
689
- """
690
- convert to string list
691
-
692
- Parameters
693
- values : list data
694
- precision : precision for float value
695
- """
696
- return list(map(lambda va: toStr(va, precision), values))
697
-
698
- def toIntFromBoolean(value):
699
- """
700
- convert to int
701
-
702
- Parameters
703
- value : boolean value
704
- """
705
- ival = 1 if value else 0
706
- return ival
707
-
708
- def scaleBySum(ldata):
709
- """
710
- scales so that sum is 1
711
-
712
- Parameters
713
- ldata : list data
714
- """
715
- s = sum(ldata)
716
- return list(map(lambda e : e/s, ldata))
717
-
718
- def scaleByMax(ldata):
719
- """
720
- scales so that max value is 1
721
-
722
- Parameters
723
- ldata : list data
724
- """
725
- m = max(ldata)
726
- return list(map(lambda e : e/m, ldata))
727
-
728
- def typedValue(val, dtype=None):
729
- """
730
- return typed value given string, discovers data type if not specified
731
-
732
- Parameters
733
- val : value
734
- dtype : data type
735
- """
736
- tVal = None
737
-
738
- if dtype is not None:
739
- if dtype == "num":
740
- dtype = "int" if dtype.find(".") == -1 else "float"
741
-
742
- if dtype == "int":
743
- tVal = int(val)
744
- elif dtype == "float":
745
- tVal = float(val)
746
- elif dtype == "bool":
747
- tVal = bool(val)
748
- else:
749
- tVal = val
750
- else:
751
- if type(val) == str:
752
- lVal = val.lower()
753
-
754
- #int
755
- done = True
756
- try:
757
- tVal = int(val)
758
- except ValueError:
759
- done = False
760
-
761
- #float
762
- if not done:
763
- done = True
764
- try:
765
- tVal = float(val)
766
- except ValueError:
767
- done = False
768
-
769
- #boolean
770
- if not done:
771
- done = True
772
- if lVal == "true":
773
- tVal = True
774
- elif lVal == "false":
775
- tVal = False
776
- else:
777
- done = False
778
- #None
779
- if not done:
780
- if lVal == "none":
781
- tVal = None
782
- else:
783
- tVal = val
784
- else:
785
- tVal = val
786
-
787
- return tVal
788
-
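A usage sketch for typedValue with type discovery (values hypothetical):

    typedValue("42")          # -> 42 (int discovered)
    typedValue("4.2")         # -> 4.2 (float discovered)
    typedValue("true")        # -> True
    typedValue("4.2", "num")  # -> 4.2, "num" resolved by the decimal point in the value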
789
- def isInt(val):
790
- """
791
- returns a pair: True if the string is an int, along with the typed value
792
-
793
- Parameters
794
- val : value
795
- """
796
- valInt = True
797
- try:
798
- tVal = int(val)
799
- except ValueError:
800
- valInt = False
801
- tVal = None
802
- r = (valInt, tVal)
803
- return r
804
-
805
- def isFloat(val):
806
- """
807
- returns a pair: True if the string is a float, along with the typed value
808
-
809
- Parameters
810
- val : value
811
- """
812
- valFloat = True
813
- try:
814
- tVal = float(val)
815
- except ValueError:
816
- valFloat = False
817
- tVal = None
818
- r = (valFloat, tVal)
819
- return r
820
-
821
- def getAllFiles(dirPath):
822
- """
823
- get all files recursively
824
-
825
- Parameters
826
- dirPath : directory path
827
- """
828
- filePaths = []
829
- for (thisDir, subDirs, fileNames) in os.walk(dirPath):
830
- for fileName in fileNames:
831
- filePaths.append(os.path.join(thisDir, fileName))
832
- filePaths.sort()
833
- return filePaths
834
-
835
- def getFileContent(fpath, verbose=False):
836
- """
837
- get file contents in directory
838
-
839
- Parameters
840
- fpath : directory path
841
- verbose : verbosity flag
842
- """
843
- # document list
844
- docComplete = []
845
- filePaths = getAllFiles(fpath)
846
-
847
- # read files
848
- for filePath in filePaths:
849
- if verbose:
850
- print("next file " + filePath)
851
- with open(filePath, 'r') as contentFile:
852
- content = contentFile.read()
853
- docComplete.append(content)
854
- return (docComplete, filePaths)
855
-
856
- def getOneFileContent(fpath):
857
- """
858
- get one file contents
859
-
860
- Parameters
861
- fpath : file path
862
- """
863
- with open(fpath, 'r') as contentFile:
864
- docStr = contentFile.read()
865
- return docStr
866
-
867
- def getFileLines(dirPath, delim=","):
868
- """
869
- get lines from a file
870
-
871
- Parameters
872
- dirPath : file path
873
- delim : delimiter
874
- """
875
- lines = list()
876
- for li in fileRecGen(dirPath, delim):
877
- lines.append(li)
878
- return lines
879
-
880
- def getFileSampleLines(dirPath, percen, delim=","):
881
- """
882
- get sampled lines from a file
883
-
884
- Parameters
885
- dirPath : file path
886
- percen : sampling percentage
887
- delim : delimiter
888
- """
889
- lines = list()
890
- for li in fileRecGen(dirPath, delim):
891
- if randint(0, 100) < percen:
892
- lines.append(li)
893
- return lines
894
-
895
- def getFileColumnAsString(dirPath, index, delim=","):
896
- """
897
- get string column from a file
898
-
899
- Parameters
900
- dirPath : file path
901
- index : index
902
- delim : delimiter
903
- """
904
- fields = list()
905
- for rec in fileRecGen(dirPath, delim):
906
- fields.append(rec[index])
907
- #print(fields)
908
- return fields
909
-
910
- def getFileColumnsAsString(dirPath, indexes, delim=","):
911
- """
912
- get multiple string columns from a file
913
-
914
- Parameters
915
- dirPath : file path
916
- indexes : indexes of columns
917
- delim : delimiter
918
-
919
- """
920
- nindex = len(indexes)
921
- columns = list(map(lambda i : list(), range(nindex)))
922
- for rec in fileRecGen(dirPath, delim):
923
- for i in range(nindex):
924
- columns[i].append(rec[indexes[i]])
925
- return columns
926
-
927
- def getFileColumnAsFloat(dirPath, index, delim=","):
928
- """
929
- get float fields from a file
930
-
931
- Parameters
932
- dirPath : file path
933
- index : index
934
- delim : delimiter
935
-
936
- """
937
- #print("{} {}".format(dirPath, index))
938
- fields = getFileColumnAsString(dirPath, index, delim)
939
- return list(map(lambda v:float(v), fields))
940
-
941
- def getFileColumnAsInt(dirPath, index, delim=","):
942
- """
943
- get int fields from a file
944
-
945
- Parameters
946
- dirPath : file path
947
- index : index
948
- delim : delimiter
949
- """
950
- fields = getFileColumnAsString(dirPath, index, delim)
951
- return list(map(lambda v:int(v), fields))
952
-
953
- def getFileAsIntMatrix(dirPath, columns, delim=","):
954
- """
955
- extracts int matrix from csv file given column indices with each row being concatenation of
956
- extracted column values (row size = num of columns)
957
-
958
- Parameters
959
- dirPath : file path
960
- columns : indexes of columns
961
- delim : delimiter
962
- """
963
- mat = list()
964
- for rec in fileSelFieldsRecGen(dirPath, columns, delim):
965
- mat.append(asIntList(rec))
966
- return mat
967
-
968
- def getFileAsFloatMatrix(dirPath, columns, delim=","):
969
- """
970
- extracts float matrix from csv file given column indices with each row being concatenation of
971
- extracted column values (row size = num of columns)
972
-
973
- Parameters
974
- dirPath : file path
975
- columns : indexes of columns
976
- delim : delimiter
977
- """
978
- mat = list()
979
- for rec in fileSelFieldsRecGen(dirPath, columns, delim):
980
- mat.append(asFloatList(rec))
981
- return mat
982
-
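A usage sketch for the matrix extractors (file name hypothetical):

    mat = getFileAsFloatMatrix("data.csv", [0, 2, 3])
    # mat[i] is [col 0, col 2, col 3] of row i, as floats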
983
- def getFileAsFloatColumn(dirPath):
984
- """
985
- get float list from a file with one float per row
986
-
987
- Parameters
988
- dirPath : file path
989
- """
990
- flist = list()
991
- for rec in fileRecGen(dirPath, None):
992
- flist.append(float(rec))
993
- return flist
994
-
995
- def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=","):
996
- """
997
- extracts float matrix from csv file given row filter and column indices with each row being
998
- concatenation of extracted column values (row size = num of columns)
999
-
1000
- Parameters
1001
- dirPath : file path
1002
- columns : indexes of columns
1003
- filt : row filter lambda
1004
- delim : delimiter
1005
-
1006
- """
1007
- mat = list()
1008
- for rec in fileFiltSelFieldsRecGen(dirPath, filt, columns, delim):
1009
- mat.append(asFloatList(rec))
1010
- return mat
1011
-
1012
- def getFileAsTypedRecords(dirPath, types, delim=","):
1013
- """
1014
- extracts typed records from csv file with each row being concatenation of
1015
- extracted column values
1016
-
1017
- Parameters
1018
- dirPath : file path
1019
- types : data types
1020
- delim : delimiter
1021
- """
1022
- (dtypes, cvalues) = extractTypesFromString(types)
1023
- tdata = list()
1024
- for rec in fileRecGen(dirPath, delim):
1025
- trec = list()
1026
- for index, value in enumerate(rec):
1027
- value = __convToTyped(index, value, dtypes)
1028
- trec.append(value)
1029
- tdata.append(trec)
1030
- return tdata
1031
-
1032
-
1033
- def getFileColsAsTypedRecords(dirPath, columns, types, delim=","):
1034
- """
1035
- extracts typed records from csv file given column indices with each row being concatenation of
1036
- extracted column values
1037
-
1038
- Parameters
1040
- dirPath : file path
1041
- columns : column indexes
1042
- types : data types
1043
- delim : delimiter
1044
- """
1045
- (dtypes, cvalues) = extractTypesFromString(types)
1046
- tdata = list()
1047
- for rec in fileSelFieldsRecGen(dirPath, columns, delim):
1048
- trec = list()
1049
- for indx, value in enumerate(rec):
1050
- tindx = columns[indx]
1051
- value = __convToTyped(tindx, value, dtypes)
1052
- trec.append(value)
1053
- tdata.append(trec)
1054
- return tdata
1055
-
1056
- def getFileColumnsMinMax(dirPath, columns, dtype, delim=","):
1057
- """
1058
- extracts numeric matrix from csv file given column indices. For each column return min and max
1059
-
1060
- Parameters
1061
- dirPath : file path
1062
- columns : column indexes
1063
- dtype : data type
1064
- delim : delimiter
1065
- """
1066
- dtypes = list(map(lambda c : str(c) + ":" + dtype, columns))
1067
- dtypes = ",".join(dtypes)
1068
- #print(dtypes)
1069
-
1070
- tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)
1071
- minMax = list()
1072
- ncola = len(tdata[0])
1073
- ncole = len(columns)
1074
- assertEqual(ncola, ncole, "actual no of columns different from expected")
1075
-
1076
- for ci in range(ncole):
1077
- vmin = sys.float_info.max
1078
- vmax = -sys.float_info.max
1079
- for r in tdata:
1080
- cv = r[ci]
1081
- vmin = cv if cv < vmin else vmin
1082
- vmax = cv if cv > vmax else vmax
1083
- mm = (vmin, vmax, vmax - vmin)
1084
- minMax.append(mm)
1085
-
1086
- return minMax
1087
-
1088
-
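A usage sketch for getFileColumnsMinMax (file name hypothetical):

    mm = getFileColumnsMinMax("prices.csv", [1, 2], "float")
    # one (min, max, range) tuple per requested column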
1089
- def getRecAsTypedRecord(rec, types, delim=None):
1090
- """
1091
- converts record to typed records
1092
-
1093
- Parameters
1094
- rec : delimiter separated string or list of strings
1095
- types : field data types
1096
- delim : delimiter
1097
- """
1098
- if delim is not None:
1099
- rec = rec.split(delim)
1100
- (dtypes, cvalues) = extractTypesFromString(types)
1101
- #print(types)
1102
- #print(dtypes)
1103
- trec = list()
1104
- for ind, value in enumerate(rec):
1105
- tvalue = __convToTyped(ind, value, dtypes)
1106
- trec.append(tvalue)
1107
- return trec
1108
-
1109
- def __convToTyped(index, value, dtypes):
1110
- """
1111
- convert to typed value
1112
-
1113
- Parameters
1114
- index : index in type list
1115
- value : data value
1116
- dtypes : data type list
1117
- """
1118
- #print(index, value)
1119
- dtype = dtypes[index]
1120
- tvalue = value
1121
- if dtype == "int":
1122
- tvalue = int(value)
1123
- elif dtype == "float":
1124
- tvalue = float(value)
1125
- return tvalue
1126
-
1127
-
1128
-
1129
- def extractTypesFromString(types):
1130
- """
1131
- extracts column data types and set values for categorical variables
1132
-
1133
- Parameters
1134
- types : encoded type information
1135
- """
1136
- ftypes = types.split(",")
1137
- dtypes = dict()
1138
- cvalues = dict()
1139
- for ftype in ftypes:
1140
- items = ftype.split(":")
1141
- cindex = int(items[0])
1142
- dtype = items[1]
1143
- dtypes[cindex] = dtype
1144
- if len(items) == 3:
1145
- sitems = items[2].split()
1146
- cvalues[cindex] = sitems
1147
- return (dtypes, cvalues)
1148
-
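The encoded type string pairs a column index with a data type, optionally followed by space separated categorical values; a sketch (spec hypothetical):

    dtypes, cvalues = extractTypesFromString("0:int,1:float,2:cat male female")
    # dtypes  -> {0: "int", 1: "float", 2: "cat"}
    # cvalues -> {2: ["male", "female"]}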
1149
- def getMultipleFileAsInttMatrix(dirPathWithCol, delim=","):
1150
- """
1151
- extracts int matrix from csv files given column index for each file.
1152
- num of columns = number of rows in each file and num of rows = number of files
1153
-
1154
- Parameters
1155
- dirPathWithCol : list of (file path, column index) pairs
1156
- delim : delimiter
1157
- """
1158
- mat = list()
1159
- minLen = -1
1160
- for path, col in dirPathWithCol:
1161
- colVals = getFileColumnAsInt(path, col, delim)
1162
- if minLen < 0 or len(colVals) < minLen:
1163
- minLen = len(colVals)
1164
- mat.append(colVals)
1165
-
1166
- #make all same length
1167
- mat = list(map(lambda li:li[:minLen], mat))
1168
- return mat
1169
-
1170
- def getMultipleFileAsFloatMatrix(dirPathWithCol, delim=","):
1171
- """
1172
- extracts float matrix from csv files given column index for each file.
1173
- num of columns = number of rows in each file and num of rows = number of files
1174
-
1175
- Parameters
1176
- dirPathWithCol : list of (file path, column index) pairs
1177
- delim : delimiter
1178
- """
1179
- mat = list()
1180
- minLen = -1
1181
- for path, col in dirPathWithCol:
1182
- colVals = getFileColumnAsFloat(path, col, delim)
1183
- if minLen < 0 or len(colVals) < minLen:
1184
- minLen = len(colVals)
1185
- mat.append(colVals)
1186
-
1187
- #make all same length
1188
- mat = list(map(lambda li:li[:minLen], mat))
1189
- return mat
1190
-
1191
- def writeStrListToFile(ldata, filePath, delem=","):
1192
- """
1193
- writes a list of delimiter separated strings, or a list of lists of strings, to a file
1194
-
1195
- Parameters
1196
- ldata : list data
1197
- filePath : file path
1198
- delem : delimiter
1199
- """
1200
- with open(filePath, "w") as fh:
1201
- for r in ldata:
1202
- if type(r) == list:
1203
- r = delem.join(r)
1204
- fh.write(r + "\n")
1205
-
1206
- def writeFloatListToFile(ldata, prec, filePath):
1207
- """
1208
- writes float list to file, one value per line
1209
-
1210
- Parameters
1211
- ldata : list data
1212
- prec : precision
1213
- filePath : file path
1214
- """
1215
- with open(filePath, "w") as fh:
1216
- for d in ldata:
1217
- fh.write(formatFloat(prec, d) + "\n")
1218
-
1219
- def mutateFileLines(dirPath, mutator, marg, delim=","):
1220
- """
1221
- mutates lines from a file
1222
-
1223
- Parameters
1224
- dirPath : file path
1225
- mutator : mutation callback
1226
- marg : argument for mutation callback
1227
- delim : delimiter
1228
- """
1229
- lines = list()
1230
- for li in fileRecGen(dirPath, delim):
1231
- li = mutator(li) if marg is None else mutator(li, marg)
1232
- lines.append(li)
1233
- return lines
1234
-
1235
- def takeFirst(elems):
1236
- """
1237
- returns first item
1238
-
1239
- Parameters
1240
- elems : list of data
1241
- """
1242
- return elems[0]
1243
-
1244
- def takeSecond(elems):
1245
- """
1246
- return 2nd element
1247
-
1248
- Parameters
1249
- elems : list of data
1250
- """
1251
- return elems[1]
1252
-
1253
- def takeThird(elems):
1254
- """
1255
- returns 3rd element
1256
-
1257
- Parameters
1258
- elems : list of data
1259
- """
1260
- return elems[2]
1261
-
1262
- def addToKeyedCounter(dCounter, key, count=1):
1263
- """
1264
- adds to keyed counter
1265
-
1266
- Parameters
1267
- dCounter : dictionary of counters
1268
- key : dictionary key
1269
- count : count to add
1270
- """
1271
- curCount = dCounter.get(key, 0)
1272
- dCounter[key] = curCount + count
1273
-
1274
- def incrKeyedCounter(dCounter, key):
1275
- """
1276
- increment keyed counter
1277
-
1278
- Parameters
1279
- dCounter : dictionary of counters
1280
- key : dictionary key
1281
- """
1282
- addToKeyedCounter(dCounter, key, 1)
1283
-
1284
- def appendKeyedList(dList, key, elem):
1285
- """
1286
- appends to a keyed list
1287
-
1288
- Parameters
1289
- dList : dictionary of lists
1290
- key : dictionary key
1291
- elem : value to append
1292
- """
1293
- curList = dList.get(key, [])
1294
- curList.append(elem)
1295
- dList[key] = curList
1296
-
1297
- def isNumber(st):
1298
- """
1299
- Returns True if string is a number
1300
-
1301
- Parameters
1302
- st : string value
1303
- """
1304
- return st.replace('.','',1).isdigit()
1305
-
1306
- def removeNan(values):
1307
- """
1308
- removes nan from list
1309
-
1310
- Parameters
1311
- values : list data
1312
- """
1313
- return list(filter(lambda v: not math.isnan(v), values))
1314
-
1315
- def fileRecGen(filePath, delim = ","):
1316
- """
1317
- file record generator
1318
-
1319
- Parameters
1320
- filePath : file path
1321
- delim : delimiter
1322
- """
1323
- with open(filePath, "r") as fp:
1324
- for line in fp:
1325
- line = line.rstrip("\n")
1326
- if delim is not None:
1327
- line = line.split(delim)
1328
- yield line
1329
-
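A usage sketch for fileRecGen (file names hypothetical):

    for rec in fileRecGen("data.csv"):
        print(rec)    # rec is a list of string fields
    for line in fileRecGen("notes.txt", None):
        print(line)   # with delim=None, whole lines are yielded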
1330
- def fileSelFieldsRecGen(dirPath, columns, delim=","):
1331
- """
1332
- file record generator given column indices
1333
-
1334
- Parameters
1335
- dirPath : file path
1336
- columns : column indexes as int array or comma separated string
1337
- delim : delimiter
1338
- """
1339
- if type(columns) == str:
1340
- columns = strToIntArray(columns, delim)
1341
- for rec in fileRecGen(dirPath, delim):
1342
- extracted = extractList(rec, columns)
1343
- yield extracted
1344
-
1345
- def fileSelFieldValueGen(dirPath, column, delim=","):
1346
- """
1347
- file record generator for a given column
1348
-
1349
- Parameters
1350
- dirPath : file path
1351
- column : column index
1352
- delim : delimiter
1353
- """
1354
- for rec in fileRecGen(dirPath, delim):
1355
- yield rec[column]
1356
-
1357
- def fileFiltRecGen(filePath, filt, delim = ","):
1358
- """
1359
- file record generator with row filter applied
1360
-
1361
- Parameters
1362
- filePath : file path
1363
- filt : row filter
1364
- delim : delimiter
1365
- """
1366
- with open(filePath, "r") as fp:
1367
- for line in fp:
1368
- line = line.rstrip("\n")
1369
- if delim is not None:
1370
- line = line.split(delim)
1371
- if filt(line):
1372
- yield line
1373
-
1374
- def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = ","):
1375
- """
1376
- file record generator with row and column filter applied
1377
-
1378
- Parameters
1379
- filePath : file path
1380
- filt : row filter
1381
- columns : column indexes as int array or comma separated string
1382
- delim : delimiter
1383
- """
1384
- if type(columns) == str:
- columns = strToIntArray(columns, delim)
1385
- with open(filePath, "r") as fp:
1386
- for line in fp:
1387
- line = line.rstrip("\n")
1388
- if delim is not None:
1389
- line = line.split(delim)
1390
- if filt(line):
1391
- selected = extractList(line, columns)
1392
- yield selected
1393
-
1394
- def fileTypedRecGen(filePath, ftypes, delim = ","):
1395
- """
1396
- file typed record generator
1397
-
1398
- Parameters
1399
- filePath : file path
1399
- ftypes : flattened list of column index and data type pairs
1400
- delim : delimiter
1402
- """
1403
- with open(filePath, "r") as fp:
1404
- for line in fp:
1405
- line = line.rstrip("\n")
1406
- line = line.split(delim)
1407
- for i in range(0, len(ftypes), 2):
1408
- ci = ftypes[i]
1409
- dtype = ftypes[i+1]
1410
- assertLesser(ci, len(line), "index out of bound")
1411
- if dtype == "int":
1412
- line[ci] = int(line[ci])
1413
- elif dtype == "float":
1414
- line[ci] = float(line[ci])
1415
- else:
1416
- exitWithMsg("invalid data type")
1417
- yield line
1418
-
1419
- def fileMutatedFieldsRecGen(dirPath, mutator, delim=","):
1420
- """
1421
- file record generator with some columns mutated
1422
-
1423
- Parameters
1424
- dirPath : file path
1425
- mutator : row field mutator
1426
- delim : delimiter
1427
- """
1428
- for rec in fileRecGen(dirPath, delim):
1429
- mutated = mutator(rec)
1430
- yield mutated
1431
-
1432
- def tableSelFieldsFilter(tdata, columns):
1433
- """
1434
- gets tabular data for selected columns
1435
-
1436
- Parameters
1437
- tdata : tabular data
1438
- columns : column indexes
1439
- """
1440
- if areAllFieldsIncluded(tdata[0], columns):
1441
- ntdata = tdata
1442
- else:
1443
- ntdata = list()
1444
- for rec in tdata:
1445
- #print(rec)
1446
- #print(columns)
1447
- nrec = extractList(rec, columns)
1448
- ntdata.append(nrec)
1449
- return ntdata
1450
-
1451
-
1452
- def areAllFieldsIncluded(ldata, columns):
1453
- """
1454
- return True if all indexes are in the columns
1455
-
1456
- Parameters
1457
- ldata : list data
1458
- columns : column indexes
1459
- """
1460
- return list(range(len(ldata))) == columns
1461
-
1462
- def asIntList(items):
1463
- """
1464
- returns int list
1465
-
1466
- Parameters
1467
- items : list data
1468
- """
1469
- return [int(i) for i in items]
1470
-
1471
- def asFloatList(items):
1472
- """
1473
- returns float list
1474
-
1475
- Parameters
1476
- items : list data
1477
- """
1478
- return [float(i) for i in items]
1479
-
1480
- def pastTime(interval, unit):
1481
- """
1482
- current and past time
1483
-
1484
- Parameters
1485
- interval : time interval
1486
- unit: time unit
1487
- """
1488
- curTime = int(time.time())
1489
- if unit == "d":
1490
- pastTime = curTime - interval * secInDay
1491
- elif unit == "h":
1492
- pastTime = curTime - interval * secInHour
1493
- elif unit == "m":
1494
- pastTime = curTime - interval * secInMinute
1495
- else:
1496
- raise ValueError("invalid time unit " + unit)
1497
- return (curTime, pastTime)
1498
-
1499
- def minuteAlign(ts):
1500
- """
1501
- minute aligned time
1502
-
1503
- Parameters
1504
- ts : time stamp in sec
1505
- """
1506
- return int((ts / secInMinute)) * secInMinute
1507
-
1508
- def multMinuteAlign(ts, min):
1509
- """
1510
- multi minute aligned time
1511
-
1512
- Parameters
1513
- ts : time stamp in sec
1514
- min : minute value
1515
- """
1516
- intv = secInMinute * min
1517
- return int((ts / intv)) * intv
1518
-
1519
- def hourAlign(ts):
1520
- """
1521
- hour aligned time
1522
-
1523
- Parameters
1524
- ts : time stamp in sec
1525
- """
1526
- return int((ts / secInHour)) * secInHour
1527
-
1528
- def hourOfDayAlign(ts, hour):
1529
- """
1530
- hour of day aligned time
1531
-
1532
- Parameters
1533
- ts : time stamp in sec
1534
- hour : hour of day
1535
- """
1536
- day = int(ts / secInDay)
1537
- return (24 * day + hour) * secInHour
1538
-
1539
- def dayAlign(ts):
1540
- """
1541
- day aligned time
1542
-
1543
- Parameters
1544
- ts : time stamp in sec
1545
- """
1546
- return int(ts / secInDay) * secInDay
1547
-
1548
- def timeAlign(ts, unit):
1549
- """
1550
- boundary alignment of time
1551
-
1552
- Parameters
1553
- ts : time stamp in sec
1554
- unit : unit of time
1555
- """
1556
- alignedTs = 0
1557
- if unit == "s":
1558
- alignedTs = ts
1559
- elif unit == "m":
1560
- alignedTs = minuteAlign(ts)
1561
- elif unit == "h":
1562
- alignedTs = hourAlign(ts)
1563
- elif unit == "d":
1564
- alignedTs = dayAlign(ts)
1565
- else:
1566
- raise ValueError("invalid time unit")
1567
- return alignedTs
1568
-
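A sketch of the alignment helpers, assuming the secInMinute / secInHour constants defined earlier in the module (timestamp hypothetical):

    ts = 1652987654
    timeAlign(ts, "h")        # floors ts to the start of its hour
    multMinuteAlign(ts, 15)   # floors ts to a 15 minute boundary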
1569
- def monthOfYear(ts):
1570
- """
1571
- month of year
1572
-
1573
- Parameters
1574
- ts : time stamp in sec
1575
- """
1576
- rem = ts % secInYear
1577
- moy = int(rem / secInMonth)
1578
- return moy
1579
-
1580
- def dayOfWeek(ts):
1581
- """
1582
- day of week
1583
-
1584
- Parameters
1585
- ts : time stamp in sec
1586
- """
1587
- rem = ts % secInWeek
1588
- dow = int(rem / secInDay)
1589
- return dow
1590
-
1591
- def hourOfDay(ts):
1592
- """
1593
- hour of day
1594
-
1595
- Parameters
1596
- ts : time stamp in sec
1597
- """
1598
- rem = ts % secInDay
1599
- hod = int(rem / secInHour)
1600
- return hod
1601
-
1602
- def processCmdLineArgs(expectedTypes, usage):
1603
- """
1604
- process command line args and returns args as typed values
1605
-
1606
- Parameters
1607
- expectedTypes : expected data types of arguments
1608
- usage : usage message string
1609
- """
1610
- args = []
1611
- numComLineArgs = len(sys.argv)
1612
- numExpected = len(expectedTypes)
1613
- if (numComLineArgs - 1 == len(expectedTypes)):
1614
- try:
1615
- for i in range(0, numExpected):
1616
- if (expectedTypes[i] == typeInt):
1617
- args.append(int(sys.argv[i+1]))
1618
- elif (expectedTypes[i] == typeFloat):
1619
- args.append(float(sys.argv[i+1]))
1620
- elif (expectedTypes[i] == typeString):
1621
- args.append(sys.argv[i+1])
1622
- except ValueError:
1623
- print ("expected number of command line arguments found but there is type mis match")
1624
- sys.exit(1)
1625
- else:
1626
- print ("expected number of command line arguments not found")
1627
- print (usage)
1628
- sys.exit(1)
1629
- return args
1630
-
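A usage sketch, assuming the typeString / typeInt / typeFloat constants defined elsewhere in the module (script and arguments hypothetical):

    # invoked as: python myscript.py data.csv 3 0.25
    fpath, col, frac = processCmdLineArgs([typeString, typeInt, typeFloat],
        "usage: myscript.py <file> <column> <fraction>")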
1631
- def mutateString(val, numMutate, ctype):
1632
- """
1633
- mutate string multiple times
1634
-
1635
- Parameters
1636
- val : string value
1637
- numMutate : num of mutations
1638
- ctype : type of character to mutate with
1639
- """
1640
- mutations = set()
1641
- count = 0
1642
- while count < numMutate:
1643
- j = randint(0, len(val)-1)
1644
- if j not in mutations:
1645
- if ctype == "alpha":
1646
- ch = selectRandomFromList(alphaTokens)
1647
- elif ctype == "num":
1648
- ch = selectRandomFromList(numTokens)
1649
- elif ctype == "any":
1650
- ch = selectRandomFromList(tokens)
- else:
- raise ValueError("invalid character type " + ctype)
1651
- val = val[:j] + ch + val[j+1:]
1652
- mutations.add(j)
1653
- count += 1
1654
- return val
1655
-
1656
- def mutateList(values, numMutate, vmin, vmax, rabs=True):
1657
- """
1658
- mutate list multiple times
1659
-
1660
- Parameters
1661
- values : list value
1662
- numMutate : num of mutations
1663
- vmin : minimum of value range
1664
- vmax : maximum of value range
1665
- rabs : True if min max range is absolute otherwise relative
1666
- """
1667
- mutations = set()
1668
- count = 0
1669
- while count < numMutate:
1670
- j = randint(0, len(values)-1)
1671
- if j not in mutations:
1672
- s = np.random.uniform(vmin, vmax)
1673
- values[j] = s if rabs else values[j] * s
1674
- count += 1
1675
- mutations.add(j)
1676
- return values
1677
-
1678
-
1679
- def swap(values, first, second):
1680
- """
1681
- swap two elements
1682
-
1683
- Parameters
1684
- values : list value
1685
- first : first swap position
1686
- second : second swap position
1687
- """
1688
- t = values[first]
1689
- values[first] = values[second]
1690
- values[second] = t
1691
-
1692
- def swapBetweenLists(values1, values2):
1693
- """
1694
- swap two elements between 2 lists
1695
-
1696
- Parameters
1697
- values1 : first list of values
1698
- values2 : second list of values
1699
- """
1700
- p1 = randint(0, len(values1)-1)
1701
- p2 = randint(0, len(values2)-1)
1702
- tmp = values1[p1]
1703
- values1[p1] = values2[p2]
1704
- values2[p2] = tmp
1705
-
1706
- def safeAppend(values, value):
1707
- """
1708
- append only if not None
1709
-
1710
- Parameters
1711
- values : list value
1712
- value : value to append
1713
- """
1714
- if value is not None:
1715
- values.append(value)
1716
-
1717
- def getAllIndex(ldata, fldata):
1718
- """
1719
- get ALL indexes of list elements
1720
-
1721
- Parameters
1722
- ldata : list data to find index in
1723
- fldata : list data for values for index look up
1724
- """
1725
- return list(map(lambda e : fldata.index(e), ldata))
1726
-
1727
- def findIntersection(lOne, lTwo):
1728
- """
1729
- find intersection elements between 2 lists
1730
-
1731
- Parameters
1732
- lOne : first list of data
1733
- lTwo : second list of data
1734
- """
1735
- sOne = set(lOne)
1736
- sTwo = set(lTwo)
1737
- sInt = sOne.intersection(sTwo)
1738
- return list(sInt)
1739
-
1740
- def isIntvOverlapped(rOne, rTwo):
1741
- """
1742
- checks overlap between 2 intervals
1743
-
1744
- Parameters
1745
- rOne : first interval boundaries
1746
- rTwo : second interval boundaries
1747
- """
1748
- clear = rOne[1] <= rTwo[0] or rOne[0] >= rTwo[1]
1749
- return not clear
1750
-
1751
- def isIntvLess(rOne, rTwo):
1752
- """
1753
- checks if first interval is less than second
1754
-
1755
- Parameters
1756
- rOne : first interval boundaries
1757
- rTwo : second interval boundaries
1758
- """
1759
- less = rOne[1] <= rTwo[0]
1760
- return less
1761
-
1762
- def findRank(e, values):
1763
- """
1764
- find rank of value in a list
1765
-
1766
- Parameters
1767
- e : value to compare with
1768
- values : list data
1769
- """
1770
- count = 1
1771
- for ve in values:
1772
- if ve < e:
1773
- count += 1
1774
- return count
1775
-
1776
- def findRanks(toBeRanked, values):
1777
- """
1778
- find ranks of values in one list in another list
1779
-
1780
- Parameters
1781
- toBeRanked : list of values for which ranks are found
1782
- values : list in which rank is found
1783
- """
1784
- return list(map(lambda e: findRank(e, values), toBeRanked))
1785
-
1786
- def formatFloat(prec, value, label = None):
1787
- """
1788
- formats a float with optional label
1789
-
1790
- Parameters
1791
- prec : precision
1792
- value : data value
1793
- label : label for data
1794
- """
1795
- st = (label + " ") if label else ""
1796
- formatter = "{:." + str(prec) + "f}"
1797
- return st + formatter.format(value)
1798
-
1799
- def formatAny(value, label = None):
1800
- """
1801
- formats any object with optional label
1802
-
1803
- Parameters
1804
- value : data value
1805
- label : label for data
1806
- """
1807
- st = (label + " ") if label else ""
1808
- return st + str(value)
1809
-
1810
- def printList(values):
1811
- """
1812
- pretty print list
1813
-
1814
- Parameters
1815
- values : list of values
1816
- """
1817
- for v in values:
1818
- print(v)
1819
-
1820
- def printMap(values, klab, vlab, precision, offset=16):
1821
- """
1822
- pretty print hash map
1823
-
1824
- Parameters
1825
- values : dictionary of values
1826
- klab : label for key
1827
- vlab : label for value
1828
- precision : precision
1829
- offset : left justify offset
1830
- """
1831
- print(klab.ljust(offset, " ") + vlab)
1832
- for k in values.keys():
1833
- v = values[k]
1834
- ks = toStr(k, precision).ljust(offset, " ")
1835
- vs = toStr(v, precision)
1836
- print(ks + vs)
1837
-
1838
- def printPairList(values, lab1, lab2, precision, offset=16):
1839
- """
1840
- pretty print list of pairs
1841
-
1842
- Parameters
1843
- values : list of pairs
1844
- lab1 : first label
1845
- lab2 : second label
1846
- precision : precision
1847
- offset : left justify offset
1848
- """
1849
- print(lab1.ljust(offset, " ") + lab2)
1850
- for (v1, v2) in values:
1851
- sv1 = toStr(v1, precision).ljust(offset, " ")
1852
- sv2 = toStr(v2, precision)
1853
- print(sv1 + sv2)
1854
-
1855
- def createMap(*values):
1856
- """
1857
- creates dictionary from a sequence of key value pairs
1858
-
1859
- Parameters
1860
- values : sequence of key value pairs
1861
- """
1862
- result = dict()
1863
- for i in range(0, len(values), 2):
1864
- result[values[i]] = values[i+1]
1865
- return result
1866
-
1867
- def getColMinMax(table, col):
1868
- """
1869
- return min, max values of a column
1870
-
1871
- Parameters
1872
- table : tabular data
1873
- col : column index
1874
- """
1875
- vmin = None
1876
- vmax = None
1877
- for rec in table:
1878
- value = rec[col]
1879
- if vmin is None:
1880
- vmin = value
1881
- vmax = value
1882
- else:
1883
- if value < vmin:
1884
- vmin = value
1885
- elif value > vmax:
1886
- vmax = value
1887
- return (vmin, vmax, vmax - vmin)
1888
-
1889
- def createLogger(name, logFilePath, logLevName):
1890
- """
1891
- creates logger
1892
-
1893
- Parameters
1894
- name : logger name
1895
- logFilePath : log file path
1896
- logLevName : log level
1897
- """
1898
- logger = logging.getLogger(name)
1899
- fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4)
1900
- logLev = logLevName.lower()
1901
- if logLev == "debug":
1902
- logLevel = logging.DEBUG
1903
- elif logLev == "info":
1904
- logLevel = logging.INFO
1905
- elif logLev == "warning":
1906
- logLevel = logging.WARNING
1907
- elif logLev == "error":
1908
- logLevel = logging.ERROR
1909
- elif logLev == "critical":
1910
- logLevel = logging.CRITICAL
1911
- else:
1912
- raise ValueError("invalid log level name " + logLevelName)
1913
- fHandler.setLevel(logLevel)
1914
- fFormat = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
1915
- fHandler.setFormatter(fFormat)
1916
- logger.addHandler(fHandler)
1917
- logger.setLevel(logLevel)
1918
- return logger
1919
-
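A usage sketch for createLogger (name and path hypothetical):

    logger = createLogger("mymodule", "app.log", "info")
    logger.info("processing started")   # log file rotates at 1 MB, keeping 4 backups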
1920
- @contextmanager
1921
- def suppressStdout():
1922
- """
1923
- suppress stdout
1924
-
1925
- Parameters
1926
-
1927
- """
1928
- with open(os.devnull, "w") as devnull:
1929
- oldStdout = sys.stdout
1930
- sys.stdout = devnull
1931
- try:
1932
- yield
1933
- finally:
1934
- sys.stdout = oldStdout
1935
-
1936
- def exitWithMsg(msg):
1937
- """
1938
- print message and exit
1939
-
1940
- Parameters
1941
- msg : message
1942
- """
1943
- print(msg + " -- quitting")
1944
- sys.exit(0)
1945
-
1946
- def drawLine(data, yscale=None):
1947
- """
1948
- line plot
1949
-
1950
- Parameters
1951
- data : list data
1952
- yscale : y axis scale
1953
- """
1954
- plt.plot(data)
1955
- if yscale:
1956
- step = int(yscale / 10)
1957
- step = int(step / 10) * 10
1958
- plt.yticks(range(0, yscale, step))
1959
- plt.show()
1960
-
1961
- def drawPlot(x, y, xlabel, ylabel):
1962
- """
1963
- line plot
1964
-
1965
- Parameters
1966
- x : x values
1967
- y : y values
1968
- xlabel : x axis label
1969
- ylabel : y axis label
1970
- """
1971
- if x is None:
1972
- x = list(range(len(y)))
1973
- plt.plot(x,y)
1974
- plt.xlabel(xlabel)
1975
- plt.ylabel(ylabel)
1976
- plt.show()
1977
-
1978
- def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label):
1979
- """
1980
- line plot of 2 lines
1981
-
1982
- Parameters
1983
- x : x values
1984
- y1 : first y values
1985
- y2 : second y values
1986
- xlabel : x label
1987
- ylabel : y label
1988
- y1label : first plot label
1989
- y2label : second plot label
1990
- """
1991
- plt.plot(x, y1, label = y1label)
1992
- plt.plot(x, y2, label = y2label)
1993
- plt.xlabel(xlabel)
1994
- plt.ylabel(ylabel)
1995
- plt.legend()
1996
- plt.show()
1997
-
1998
- def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10):
1999
- """
2000
- draw histogram
2001
-
2002
- Parameters
2003
- ldata : list data
2004
- myTitle : title
2005
- myXlabel : x label
2006
- myYlabel : y label
2007
- nbins : num of bins
2008
- """
2009
- plt.hist(ldata, bins=nbins, density=True)
2010
- plt.title(myTitle)
2011
- plt.xlabel(myXlabel)
2012
- plt.ylabel(myYlabel)
2013
- plt.show()
2014
-
2015
- def saveObject(obj, filePath):
2016
- """
2017
- saves an object
2018
-
2019
- Parameters
2020
- obj : object
2021
- filePath : file path for saved object
2022
- """
2023
- with open(filePath, "wb") as outfile:
2024
- pickle.dump(obj,outfile)
2025
-
2026
- def restoreObject(filePath):
2027
- """
2028
- restores an object
2029
-
2030
- Parameters
2031
- filePath : file path to restore object from
2032
- """
2033
- with open(filePath, "rb") as infile:
2034
- obj = pickle.load(infile)
2035
- return obj
2036
-
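A round trip sketch for the pickle helpers (object and path hypothetical):

    model = {"weights": [0.2, 0.8]}
    saveObject(model, "model.pkl")
    restored = restoreObject("model.pkl")   # equal to model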
2037
- def isNumeric(data):
2038
- """
2039
- true if all elements int or float
2040
-
2041
- Parameters
2042
- data : numeric data list
2043
- """
2044
- if type(data) == list or type(data) == np.ndarray:
2045
- col = pd.Series(data)
2046
- else:
2047
- col = data
2048
- return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64
2049
-
2050
- def isInteger(data):
2051
- """
2052
- true if all elements int
2053
-
2054
- Parameters
2055
- data : numeric data list
2056
- """
2057
- if type(data) == list or type(data) == np.ndarray:
2058
- col = pd.Series(data)
2059
- else:
2060
- col = data
2061
- return col.dtype == np.int32 or col.dtype == np.int64
2062
-
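- #note: this redefines the earlier scalar isFloat(val) helper; this later definition wins at import time, shadowing the tuple returning variant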
2063
- def isFloat(data):
2064
- """
2065
- true if all elements float
2066
-
2067
- Parameters
2068
- data : numeric data list
2069
- """
2070
- if type(data) == list or type(data) == np.ndarray:
2071
- col = pd.Series(data)
2072
- else:
2073
- col = data
2074
- return col.dtype == np.float32 or col.dtype == np.float64
2075
-
2076
- def isBinary(data):
2077
- """
2078
- true if all elements either 0 or 1
2079
-
2080
- Parameters
2081
- data : binary data
2082
- """
2083
- re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None)
2084
- return (re is None)
2085
-
2086
- def isCategorical(data):
2087
- """
2088
- true if all elements int or string
2089
-
2090
- Parameters
2091
- data : data value
2092
- """
2093
- re = next((d for d in data if not (type(d) == int or type(d) == str)), None)
2094
- return (re is None)
2095
-
2096
- def assertEqual(value, veq, msg):
2097
- """
2098
- assert equal to
2099
-
2100
- Parameters
2101
- value : value
2102
- veq : value to be equated with
2103
- msg : error msg
2104
- """
2105
- assert value == veq , msg
2106
-
2107
- def assertGreater(value, vmin, msg):
2108
- """
2109
- assert greater than
2110
-
2111
- Parameters
2112
- value : value
2113
- vmin : minimum value
2114
- msg : error msg
2115
- """
2116
- assert value > vmin , msg
2117
-
2118
- def assertGreaterEqual(value, vmin, msg):
2119
- """
2120
- assert greater than or equal
2121
-
2122
- Parameters
2123
- value : value
2124
- vmin : minimum value
2125
- msg : error msg
2126
- """
2127
- assert value >= vmin , msg
2128
-
2129
- def assertLesser(value, vmax, msg):
2130
- """
2131
- assert less than
2132
-
2133
- Parameters
2134
- value : value
2135
- vmax : maximum value
2136
- msg : error msg
2137
- """
2138
- assert value < vmax , msg
2139
-
2140
- def assertLesserEqual(value, vmax, msg):
2141
- """
2142
- assert less than or equal
2143
-
2144
- Parameters
2145
- value : value
2146
- vmax : maximum value
2147
- msg : error msg
2148
- """
2149
- assert value <= vmax , msg
2150
-
2151
- def assertWithinRange(value, vmin, vmax, msg):
2152
- """
2153
- assert within range
2154
-
2155
- Parameters
2156
- value : value
2157
- vmin : minimum value
2158
- vmax : maximum value
2159
- msg : error msg
2160
- """
2161
- assert value >= vmin and value <= vmax, msg
2162
-
2163
- def assertInList(value, values, msg):
2164
- """
2165
- assert contains in a list
2166
-
2167
- Parameters
2168
- value : value to check for inclusion
2169
- values : list data
2170
- msg : error msg
2171
- """
2172
- assert value in values, msg
2173
-
2174
- def maxListDist(l1, l2):
2175
- """
2176
- maximum list element difference between 2 lists
2177
-
2178
- Parameters
2179
- l1 : first list data
2180
- l2 : second list data
2181
- """
2182
- dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2))))
2183
- return dist
2184
-
2185
- def fileLineCount(fPath):
2186
- """
2187
- number of lines in a file
2188
-
2189
- Parameters
2190
- fPath : file path
2191
- """
2192
- with open(fPath) as f:
2193
- i = -1
- for i, li in enumerate(f):
2194
- pass
2195
- return (i + 1)
2196
-
2197
- def getAlphaNumCharCount(sdata):
2198
- """
2199
- number of alphabetic and numeric characters in a string
2200
-
2201
- Parameters
2202
- sdata : string data
2203
- """
2204
- acount = 0
2205
- ncount = 0
2206
- scount = 0
2207
- ocount = 0
2208
- assertEqual(type(sdata), str, "input must be string")
2209
- for c in sdata:
2210
- if c.isnumeric():
2211
- ncount += 1
2212
- elif c.isalpha():
2213
- acount += 1
2214
- elif c.isspace():
2215
- scount += 1
2216
- else:
2217
- ocount += 1
2218
- r = (acount, ncount, ocount)
2219
- return r
2220
-
2221
- def genPowerSet(cvalues, incEmpty=False):
2222
- """
2223
- generates power set i.e all possible subsets
2224
-
2225
- Parameters
2226
- cvalues : list of categorical values
2227
- incEmpty : include empty set if True
2228
- """
2229
- ps = list()
2230
- for cv in cvalues:
2231
- pse = list()
2232
- for s in ps:
2233
- sc = s.copy()
2234
- sc.add(cv)
2235
- #print(sc)
2236
- pse.append(sc)
2237
- ps.extend(pse)
2238
- es = set()
2239
- es.add(cv)
2240
- ps.append(es)
2241
- #print(es)
2242
-
2243
- if incEmpty:
2244
- ps.append(set())
2245
- return ps
2246
-
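A usage sketch for genPowerSet (values hypothetical):

    genPowerSet(["a", "b"])
    # -> [{"a"}, {"a", "b"}, {"b"}]; the empty set is added only when incEmpty=True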
2247
- class StepFunction:
2248
- """
2249
- step function
2250
-
2251
- Parameters
2252
-
2253
- """
2254
- def __init__(self, *values):
2255
- """
2256
- initializer
2257
-
2258
- Parameters
2259
- values : list of tuples, with each tuple containing 2 x values and the corresponding y value
2260
- """
2261
- self.points = values
2262
-
2263
- def find(self, x):
2264
- """
2265
- finds step function value
2266
-
2267
- Parameters
2268
- x : x value
2269
- """
2270
- found = False
2271
- y = 0
2272
- for p in self.points:
2273
- if (x >= p[0] and x < p[1]):
2274
- y = p[2]
2275
- found = True
2276
- break
2277
-
2278
- if not found:
2279
- l = len(self.points)
2280
- if (x < self.points[0][0]):
2281
- y = self.points[0][2]
2282
- elif (x > self.points[l-1][1]):
2283
- y = self.points[l-1][2]
2284
- return y
2285
-
2286
-
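A usage sketch for StepFunction (breakpoints hypothetical):

    sf = StepFunction((0, 10, 1.0), (10, 20, 2.0))
    sf.find(5)    # -> 1.0
    sf.find(25)   # -> 2.0, clamped to the last step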
2287
- class DummyVarGenerator:
2288
- """
2289
- dummy variable generator for categorical variable
2290
- """
2291
- def __init__(self, rowSize, catValues, trueVal, falseVal, delim=None):
2292
- """
2293
- initializer
2294
-
2295
- Parameters
2296
- rowSize : row size
2297
- catValues : dictionary with field index as key and list of categorical values as value
2298
- trueVal : true value, typically "1"
2299
- falseVal : false value, typically "0"
2300
- delim : field delimiter
2301
- """
2302
- self.rowSize = rowSize
2303
- self.catValues = catValues
2304
- numCatVar = len(catValues)
2305
- colCount = 0
2306
- for v in self.catValues.values():
2307
- colCount += len(v)
2308
- self.newRowSize = rowSize - numCatVar + colCount
2309
- #print ("new row size {}".format(self.newRowSize))
2310
- self.trueVal = trueVal
2311
- self.falseVal = falseVal
2312
- self.delim = delim
2313
-
2314
- def processRow(self, row):
2315
- """
2316
- encodes categorical variables, returning a delimiter separated string or list
2317
-
2318
- Parameters
2319
- row : row either delemeter separated string or list
2320
- """
2321
- if self.delim is not None:
2322
- rowArr = row.split(self.delim)
2323
- msg = "row does not have expected number of columns found " + str(len(rowArr)) + " expected " + str(self.rowSize)
2324
- assert len(rowArr) == self.rowSize, msg
2325
- else:
2326
- rowArr = row
2327
-
2328
- newRowArr = []
2329
- for i in range(len(rowArr)):
2330
- curVal = rowArr[i]
2331
- if (i in self.catValues):
2332
- values = self.catValues[i]
2333
- for val in values:
2334
- if val == curVal:
2335
- newVal = self.trueVal
2336
- else:
2337
- newVal = self.falseVal
2338
- newRowArr.append(newVal)
2339
- else:
2340
- newRowArr.append(curVal)
2341
- assert len(newRowArr) == self.newRowSize, "invalid new row size " + str(len(newRowArr)) + " expected " + str(self.newRowSize)
2342
- encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr
2343
- return encRow
2344
-
2345
-
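A one hot encoding sketch for DummyVarGenerator (schema hypothetical):

    # 3 column rows, with column 1 categorical over red / green / blue
    dgen = DummyVarGenerator(3, {1 : ["red", "green", "blue"]}, "1", "0", ",")
    dgen.processRow("5.2,green,7")   # -> "5.2,0,1,0,7"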
matumizi/pyproject.toml DELETED
@@ -1,6 +0,0 @@
1
- [build-system]
2
- requires = [
3
- "setuptools>=42",
4
- "wheel"
5
- ]
6
- build-backend = "setuptools.build_meta"
matumizi/requirements.txt DELETED
@@ -1,9 +0,0 @@
1
- hurst==0.0.5
2
- jprops==2.0.2
3
- matplotlib==3.3.0
4
- numpy==1.18.5
5
- pandas==1.1.0
6
- python_Levenshtein==0.12.2
7
- scikit_learn==1.0.2
8
- scipy==1.5.2
9
- statsmodels==0.11.1
matumizi/resources/spdata.txt DELETED
@@ -1,12 +0,0 @@
1
- WMT,171,22030
2
- PFE,226,9818
3
- NFLX,138,48338
4
- AMD,211,19423
5
- TSLA,57,55317
6
- AMZN,72,9604
7
- META,121,24221
8
- QCOM,83,13180
9
- CSCO,137,5854
10
- MSFT,67,16717
11
- SBUX,140,12640
12
- AAPL,78,11578
matumizi/setup.cfg DELETED
@@ -1,18 +0,0 @@
1
- [metadata]
2
- name = matumizi
3
- version = 0.0.7
4
- author = Pranab Ghosh
5
- author_email = pkghosh99@gmail.com
6
- description = Data exploration along with various utilities for Data Science
7
- long_description = file: README.md
8
- long_description_content_type = text/markdown
9
- url = https://github.com/pranab/whakapai/tree/master/matumizi
10
- classifiers =
11
- Programming Language :: Python :: 3
12
- License :: OSI Approved :: GNU General Public License v2 (GPLv2)
13
- Operating System :: OS Independent
14
-
15
- [options]
16
- packages = find:
17
- python_requires = >=3.7
18
- include_package_data = True