Spaces:
Runtime error
Runtime error
#!/usr/local/bin/python3 | |
# avenir-python: Machine Learning | |
# Author: Pranab Ghosh | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); you | |
# may not use this file except in compliance with the License. You may | |
# obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |
# implied. See the License for the specific language governing | |
# permissions and limitations under the License. | |
import sys | |
import random | |
import time | |
import math | |
import numpy as np | |
import statistics | |
from .util import * | |
""" | |
histogram class | |
""" | |
class Histogram:
    """
    fixed bin width histogram over a numeric domain

    A value x is assigned to bin int((x - xmin) / binWidth). Instances are
    normally built with one of the create* class methods, which allocate the
    bins array; the plain constructor only records xmin and binWidth.
    """

    def __init__(self, min, binWidth):
        """
        initializer

        Parameters
            min : min x
            binWidth : bin width
        """
        self.xmin = min
        self.binWidth = binWidth
        self.normalized = False
        # filled in by cumDistr(); None lets percentile() and cumValue()
        # detect that the cumulative distribution has not been computed yet
        self.cbins = None

    @classmethod
    def createInitialized(cls, xmin, binWidth, values):
        """
        create histogram instance with min domain, bin width and values

        Parameters
            xmin : min x
            binWidth : bin width
            values : y values, one per bin
        """
        instance = cls(xmin, binWidth)
        # numBin is required by add() and initialize()
        instance.numBin = len(values)
        instance.xmax = xmin + binWidth * (len(values) - 1)
        instance.bins = np.array(values)
        # largest bin value, never below 0
        instance.fmax = max(0, max(values, default=0))
        instance.ymin = 0.0
        instance.ymax = instance.fmax
        return instance

    @classmethod
    def createWithNumBins(cls, values, numBins=20):
        """
        create histogram instance from raw values and no of bins

        Parameters
            values : x values to be binned
            numBins : no of bins
        """
        xmin = min(values)
        xmax = max(values)
        # the domain is padded by .01 on each side so that the largest
        # value still falls inside the last bin
        binWidth = (xmax + .01 - (xmin - .01)) / numBins
        instance = cls(xmin, binWidth)
        instance.xmax = xmax
        instance.numBin = numBins
        instance.bins = np.zeros(instance.numBin)
        for v in values:
            instance.add(v)
        return instance

    @classmethod
    def createUninitialized(cls, xmin, xmax, binWidth):
        """
        create histogram instance with no y values using domain min, max and bin width

        Parameters
            xmin : min x
            xmax : max x
            binWidth : bin width
        """
        instance = cls(xmin, binWidth)
        instance.xmax = xmax
        # np.zeros needs an integer size; true division yields a float
        instance.numBin = int((xmax - xmin) / binWidth) + 1
        instance.bins = np.zeros(instance.numBin)
        return instance

    def initialize(self):
        """
        set y values to 0
        """
        self.bins = np.zeros(self.numBin)

    def add(self, value):
        """
        adds a value to a bin

        Parameters
            value : value

        Raises
            ValueError : if the value maps to a bin outside the histogram
        """
        index = self.bin(value)
        if index < 0 or index > self.numBin - 1:
            raise ValueError("outside histogram range")
        self.bins[index] += 1.0

    def normalize(self):
        """
        normalize bin counts so that they sum to 1; a histogram with no
        counts is left untouched instead of being turned into NaN
        """
        if not self.normalized:
            total = self.bins.sum()
            if total > 0:
                self.bins = np.divide(self.bins, total)
                self.normalized = True

    def cumDistr(self):
        """
        normalizes, then computes and returns the cumulative distribution
        """
        self.normalize()
        self.cbins = np.cumsum(self.bins)
        return self.cbins

    def distr(self):
        """
        normalizes and returns the distribution
        """
        self.normalize()
        return self.bins

    def percentile(self, percent):
        """
        return x value corresponding to a percentile, interpolating linearly
        inside the bin where the cumulative distribution reaches percent

        Parameters
            percent : percentile as a fraction between 0 and 1, i.e. on the
                      same scale as the cumulative distribution

        Raises
            ValueError : if cumDistr() has not been called, or percent is
                         outside 0 to 1
        """
        if self.cbins is None:
            raise ValueError("cumulative distribution is not available")
        if not 0.0 <= percent <= 1.0:
            raise ValueError("percentile should be between 0 and 1")

        last = len(self.cbins) - 1
        for i, cuml in enumerate(self.cbins):
            # the last bin is the fallback when rounding leaves the final
            # cumulative value marginally below percent
            if cuml >= percent or i == last:
                below = self.cbins[i - 1] if i > 0 else 0.0
                mass = cuml - below
                frac = (percent - below) / mass if mass > 0 else 0.0
                frac = min(frac, 1.0)
                return self.xmin + (i + frac) * self.binWidth
        raise ValueError("cumulative distribution is empty")

    def max(self):
        """
        return max bin value
        """
        return self.bins.max()

    def value(self, x):
        """
        return a bin value

        Parameters
            x : x value
        """
        return self.bins[self.bin(x)]

    def bin(self, x):
        """
        return a bin index

        Parameters
            x : x value
        """
        return int((x - self.xmin) / self.binWidth)

    def cumValue(self, x):
        """
        return a cumulative bin value

        Parameters
            x : x value

        Raises
            ValueError : if cumDistr() has not been called
        """
        if self.cbins is None:
            raise ValueError("cumulative distribution is not available")
        return self.cbins[self.bin(x)]

    def getMinMax(self):
        """
        returns x min and x max
        """
        return (self.xmin, self.xmax)

    def boundedValue(self, x):
        """
        return x bounded by min and max

        Parameters
            x : x value
        """
        if x < self.xmin:
            x = self.xmin
        elif x > self.xmax:
            x = self.xmax
        return x
""" | |
categorical histogram class | |
""" | |
class CatHistogram:
    """
    categorical histogram class, keeps a count per distinct value in the
    binCounts dictionary; normalize() replaces the counts with fractions
    of the total
    """

    def __init__(self):
        """
        initializer
        """
        self.binCounts = dict()
        self.counts = 0
        self.normalized = False

    def add(self, value):
        """
        adds a value to a bin

        Parameters
            value : categorical value
        """
        addToKeyedCounter(self.binCounts, value)
        self.counts += 1

    def normalize(self):
        """
        normalize, i.e. divide each bin count by the total count; done only once
        """
        if not self.normalized:
            self.binCounts = {k: v / self.counts for k, v in self.binCounts.items()}
            self.normalized = True

    def getMode(self):
        """
        get mode, returned as a (value, bin content) tuple; (None, 0) when
        no bin holds a positive content
        """
        maxk = None
        maxv = 0
        for k, v in self.binCounts.items():
            if v > maxv:
                maxk = k
                maxv = v
        return (maxk, maxv)

    def getEntropy(self):
        """
        get entropy (natural log) of the normalized distribution
        """
        self.normalize()
        entr = 0
        for v in self.binCounts.values():
            # a zero probability bin contributes nothing (0 * log 0 taken as 0);
            # math.log(0) would raise a math domain error
            if v > 0:
                entr -= v * math.log(v)
        return entr

    def getUniqueValues(self):
        """
        get unique values
        """
        return list(self.binCounts.keys())

    def getDistr(self):
        """
        get distribution, returned as a copy of the normalized bin dictionary
        """
        self.normalize()
        return self.binCounts.copy()
class RunningStat:
    """
    running stat class, keeps count, sum and sum of squares so that mean
    and std deviation can be obtained without storing the values
    """

    def __init__(self):
        """
        initializer
        """
        self.sum = 0.0
        self.sumSq = 0.0
        self.count = 0

    @staticmethod
    def create(count, sum, sumSq):
        """
        creates instance from a previously saved state

        Parameters
            count : count of values
            sum : sum of values
            sumSq : sum of values squared
        """
        rs = RunningStat()
        rs.sum = sum
        rs.sumSq = sumSq
        rs.count = count
        return rs

    def add(self, value):
        """
        adds new value

        Parameters
            value : value to add
        """
        self.sum += value
        self.sumSq += (value * value)
        self.count += 1

    def getStat(self):
        """
        return mean and sample std deviation as a tuple; with a single value
        the std deviation is reported as 0.0

        Raises
            ZeroDivisionError : if no value has been added
        """
        n = self.count
        mean = self.sum / n
        if n < 2:
            # sample std deviation is undefined for one value
            return (mean, 0.0)
        t = (self.sumSq - n * mean * mean) / (n - 1)
        # rounding can push the variance of near constant data slightly below 0
        sd = math.sqrt(max(t, 0.0))
        return (mean, sd)

    def addGetStat(self, value):
        """
        calculate mean and std deviation with new value added

        Parameters
            value : value to add
        """
        self.add(value)
        return self.getStat()

    def getCount(self):
        """
        return count
        """
        return self.count

    def getState(self):
        """
        return state as a (count, sum, sumSq) tuple
        """
        return (self.count, self.sum, self.sumSq)
class SlidingWindowStat:
    """
    sliding window stats, keeps the most recent values up to the window
    size (count) together with their sum and sum of squares
    """

    def __init__(self):
        """
        initializer
        """
        self.sum = 0.0
        self.sumSq = 0.0
        self.count = 0
        self.values = None

    @staticmethod
    def create(values, sum, sumSq):
        """
        creates instance from values and precomputed sums; the window size
        is the number of values

        Parameters
            values : list of values
            sum : sum of values
            sumSq : sum of values squared
        """
        sws = SlidingWindowStat()
        sws.sum = sum
        sws.sumSq = sumSq
        # own copy, so that sliding the window does not alter the caller's list
        sws.values = list(values)
        sws.count = len(sws.values)
        return sws

    @staticmethod
    def initialize(values):
        """
        creates instance from values, computing the sums; the window size
        is the number of values

        Parameters
            values : list of values
        """
        sws = SlidingWindowStat()
        sws.values = list(values)
        for v in sws.values:
            sws.sum += v
            sws.sumSq += v * v
        sws.count = len(sws.values)
        return sws

    @staticmethod
    def createEmpty(count):
        """
        creates instance with no values

        Parameters
            count : window size
        """
        sws = SlidingWindowStat()
        sws.count = count
        sws.values = list()
        return sws

    def add(self, value):
        """
        adds new value, dropping the oldest one once the window is full

        Parameters
            value : value to add
        """
        if self.values is None:
            # instance built with the plain constructor
            self.values = list()
        self.values.append(value)
        if len(self.values) > self.count:
            oldest = self.values.pop(0)
            self.sum += value - oldest
            self.sumSq += (value * value) - (oldest * oldest)
        else:
            self.sum += value
            self.sumSq += (value * value)

    def getStat(self):
        """
        calculate mean and sample std deviation of the values currently in
        the window; with a single value the std deviation is reported as 0.0

        Raises
            ZeroDivisionError : if the window holds no value
        """
        # use the number of values actually held, the window may not be full yet
        n = len(self.values) if self.values is not None else self.count
        mean = self.sum / n
        if n < 2:
            return (mean, 0.0)
        t = (self.sumSq - n * mean * mean) / (n - 1)
        # rounding can push the variance of near constant data slightly below 0
        sd = math.sqrt(max(t, 0.0))
        return (mean, sd)

    def addGetStat(self, value):
        """
        calculate mean and std deviation with new value added

        Parameters
            value : value to add
        """
        self.add(value)
        return self.getStat()

    def getCount(self):
        """
        return window size
        """
        return self.count

    def getCurSize(self):
        """
        return no of values currently in the window
        """
        return len(self.values)

    def getState(self):
        """
        return state as a (count, sum, sumSq) tuple
        """
        return (self.count, self.sum, self.sumSq)
def basicStat(ldata):
    """
    mean and sample std dev of a list, returned as a (mean, std dev) tuple

    Parameters
        ldata : list of values
    """
    avg = statistics.mean(ldata)
    # the mean is handed to stdev through xbar so it is not computed twice
    return (avg, statistics.stdev(ldata, xbar=avg))
def getFileColumnStat(filePath, col, delem=","):
    """
    gets stats for a file column, i.e. whatever RunningStat.getStat()
    returns after every record's column value has been added

    Parameters
        filePath : file path
        col : col index
        delem : field delimiter
    """
    stat = RunningStat()
    for fields in fileRecGen(filePath, delem):
        # column content is converted to float before being accumulated
        stat.add(float(fields[col]))
    return stat.getStat()