Priyanka-Kumavat-At-TE's picture
Upload 7 files
e03eaf2
#!/usr/local/bin/python3
# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
import sys
import random
import time
import math
import numpy as np
import statistics
from .util import *
"""
histogram class
"""
class Histogram:
def __init__(self, min, binWidth):
"""
initializer
Parameters
min : min x
binWidth : bin width
"""
self.xmin = min
self.binWidth = binWidth
self.normalized = False
@classmethod
def createInitialized(cls, xmin, binWidth, values):
"""
create histogram instance with min domain, bin width and values
Parameters
min : min x
binWidth : bin width
values : y values
"""
instance = cls(xmin, binWidth)
instance.xmax = xmin + binWidth * (len(values) - 1)
instance.ymin = 0
instance.bins = np.array(values)
instance.fmax = 0
for v in values:
if (v > instance.fmax):
instance.fmax = v
instance.ymin = 0.0
instance.ymax = instance.fmax
return instance
@classmethod
def createWithNumBins(cls, values, numBins=20):
"""
create histogram instance values and no of bins
Parameters
values : y values
numBins : no of bins
"""
xmin = min(values)
xmax = max(values)
binWidth = (xmax + .01 - (xmin - .01)) / numBins
instance = cls(xmin, binWidth)
instance.xmax = xmax
instance.numBin = numBins
instance.bins = np.zeros(instance.numBin)
for v in values:
instance.add(v)
return instance
@classmethod
def createUninitialized(cls, xmin, xmax, binWidth):
"""
create histogram instance with no y values using domain min , max and bin width
Parameters
min : min x
max : max x
binWidth : bin width
"""
instance = cls(xmin, binWidth)
instance.xmax = xmax
instance.numBin = (xmax - xmin) / binWidth + 1
instance.bins = np.zeros(instance.numBin)
return instance
def initialize(self):
"""
set y values to 0
"""
self.bins = np.zeros(self.numBin)
def add(self, value):
"""
adds a value to a bin
Parameters
value : value
"""
bin = int((value - self.xmin) / self.binWidth)
if (bin < 0 or bin > self.numBin - 1):
print (bin)
raise ValueError("outside histogram range")
self.bins[bin] += 1.0
def normalize(self):
"""
normalize bin counts
"""
if not self.normalized:
total = self.bins.sum()
self.bins = np.divide(self.bins, total)
self.normalized = True
def cumDistr(self):
"""
cumulative dists
"""
self.normalize()
self.cbins = np.cumsum(self.bins)
return self.cbins
def distr(self):
"""
distr
"""
self.normalize()
return self.bins
def percentile(self, percent):
"""
return value corresponding to a percentile
Parameters
percent : percentile value
"""
if self.cbins is None:
raise ValueError("cumulative distribution is not available")
for i,cuml in enumerate(self.cbins):
if percent > cuml:
value = (i * self.binWidth) - (self.binWidth / 2) + \
(percent - self.cbins[i-1]) * self.binWidth / (self.cbins[i] - self.cbins[i-1])
break
return value
def max(self):
"""
return max bin value
"""
return self.bins.max()
def value(self, x):
"""
return a bin value
Parameters
x : x value
"""
bin = int((x - self.xmin) / self.binWidth)
f = self.bins[bin]
return f
def bin(self, x):
"""
return a bin index
Parameters
x : x value
"""
return int((x - self.xmin) / self.binWidth)
def cumValue(self, x):
"""
return a cumulative bin value
Parameters
x : x value
"""
bin = int((x - self.xmin) / self.binWidth)
c = self.cbins[bin]
return c
def getMinMax(self):
"""
returns x min and x max
"""
return (self.xmin, self.xmax)
def boundedValue(self, x):
"""
return x bounde by min and max
Parameters
x : x value
"""
if x < self.xmin:
x = self.xmin
elif x > self.xmax:
x = self.xmax
return x
"""
categorical histogram class
"""
class CatHistogram:
def __init__(self):
"""
initializer
"""
self.binCounts = dict()
self.counts = 0
self.normalized = False
def add(self, value):
"""
adds a value to a bin
Parameters
x : x value
"""
addToKeyedCounter(self.binCounts, value)
self.counts += 1
def normalize(self):
"""
normalize
"""
if not self.normalized:
self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))
self.normalized = True
def getMode(self):
"""
get mode
"""
maxk = None
maxv = 0
#print(self.binCounts)
for k,v in self.binCounts.items():
if v > maxv:
maxk = k
maxv = v
return (maxk, maxv)
def getEntropy(self):
"""
get entropy
"""
self.normalize()
entr = 0
#print(self.binCounts)
for k,v in self.binCounts.items():
entr -= v * math.log(v)
return entr
def getUniqueValues(self):
"""
get unique values
"""
return list(self.binCounts.keys())
def getDistr(self):
"""
get distribution
"""
self.normalize()
return self.binCounts.copy()
class RunningStat:
"""
running stat class
"""
def __init__(self):
"""
initializer
"""
self.sum = 0.0
self.sumSq = 0.0
self.count = 0
@staticmethod
def create(count, sum, sumSq):
"""
creates iinstance
Parameters
sum : sum of values
sumSq : sum of valure squared
"""
rs = RunningStat()
rs.sum = sum
rs.sumSq = sumSq
rs.count = count
return rs
def add(self, value):
"""
adds new value
Parameters
value : value to add
"""
self.sum += value
self.sumSq += (value * value)
self.count += 1
def getStat(self):
"""
return mean and std deviation
"""
mean = self.sum /self. count
t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)
sd = math.sqrt(t)
re = (mean, sd)
return re
def addGetStat(self,value):
"""
calculate mean and std deviation with new value added
Parameters
value : value to add
"""
self.add(value)
re = self.getStat()
return re
def getCount(self):
"""
return count
"""
return self.count
def getState(self):
"""
return state
"""
s = (self.count, self.sum, self.sumSq)
return s
class SlidingWindowStat:
"""
sliding window stats
"""
def __init__(self):
"""
initializer
"""
self.sum = 0.0
self.sumSq = 0.0
self.count = 0
self.values = None
@staticmethod
def create(values, sum, sumSq):
"""
creates iinstance
Parameters
sum : sum of values
sumSq : sum of valure squared
"""
sws = SlidingWindowStat()
sws.sum = sum
sws.sumSq = sumSq
self.values = values.copy()
sws.count = len(self.values)
return sws
@staticmethod
def initialize(values):
"""
creates iinstance
Parameters
values : list of values
"""
sws = SlidingWindowStat()
sws.values = values.copy()
for v in sws.values:
sws.sum += v
sws.sumSq += v * v
sws.count = len(sws.values)
return sws
@staticmethod
def createEmpty(count):
"""
creates iinstance
Parameters
count : count of values
"""
sws = SlidingWindowStat()
sws.count = count
sws.values = list()
return sws
def add(self, value):
"""
adds new value
Parameters
value : value to add
"""
self.values.append(value)
if len(self.values) > self.count:
self.sum += value - self.values[0]
self.sumSq += (value * value) - (self.values[0] * self.values[0])
self.values.pop(0)
else:
self.sum += value
self.sumSq += (value * value)
def getStat(self):
"""
calculate mean and std deviation
"""
mean = self.sum /self. count
t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)
sd = math.sqrt(t)
re = (mean, sd)
return re
def addGetStat(self,value):
"""
calculate mean and std deviation with new value added
"""
self.add(value)
re = self.getStat()
return re
def getCount(self):
"""
return count
"""
return self.count
def getCurSize(self):
"""
return count
"""
return len(self.values)
def getState(self):
"""
return state
"""
s = (self.count, self.sum, self.sumSq)
return s
def basicStat(ldata):
"""
mean and std dev
Parameters
ldata : list of values
"""
m = statistics.mean(ldata)
s = statistics.stdev(ldata, xbar=m)
r = (m, s)
return r
def getFileColumnStat(filePath, col, delem=","):
"""
gets stats for a file column
Parameters
filePath : file path
col : col index
delem : field delemter
"""
rs = RunningStat()
for rec in fileRecGen(filePath, delem):
va = float(rec[col])
rs.add(va)
return rs.getStat()