#!/usr/local/bin/python3

# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

import os
import sys
from random import randint
import random
import time
import uuid
from datetime import datetime
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import logging
import logging.handlers
import pickle
from contextlib import contextmanager

# character pool used for generated IDs: digits, upper case letters and the digits once more
# NOTE(review): the second run of digits looks deliberate (digits get drawn more often) - confirm
tokens = ["0","1","2","3","4","5","6","7","8","9","A","B","C","D","E","F","G","H","I","J","K","L","M",
    "N","O","P","Q","R","S","T","U","V","W","X","Y","Z","0","1","2","3","4","5","6","7","8","9"]
numTokens = tokens[:10]
alphaTokens = tokens[10:36]
loCaseChars = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k","l","m","n","o",
    "p","q","r","s","t","u","v","w","x","y","z"]

# type names accepted by processCmdLineArgs
typeInt = "int"
typeFloat = "float"
typeString = "string"

# time unit conversions (a year is taken as 365 days, a month as 1/12 of that)
secInMinute = 60
secInHour = 60 * 60
secInDay = 24 * secInHour
secInWeek = 7 * secInDay
secInYear = 365 * secInDay
secInMonth = secInYear / 12

minInHour = 60
minInDay = 24 * minInHour

# length unit conversions
ftPerYard = 3
ftPerMile = ftPerYard * 1760


def genID(size):
    """
    generates ID made of characters drawn randomly from the token pool

    Parameters
        size : size of ID
    """
    # join instead of repeated string concatenation, one random draw per character
    return "".join(selectRandomFromList(tokens) for _ in range(size))


def genIdList(numId, idSize):
    """
    generates list of IDs

    Parameters
        numId : number of IDs
        idSize : ID size
    """
    return [genID(idSize) for _ in range(numId)]


def genNumID(size):
    """
    generates ID consisting of digits only

    Parameters
        size : size of ID
    """
    return "".join(selectRandomFromList(numTokens) for _ in range(size))
def genLowCaseID(size):
    """
    generates ID consisting of lower case chars

    Parameters
        size : size of ID
    """
    return "".join(selectRandomFromList(loCaseChars) for _ in range(size))


def genNumIdList(numId, idSize):
    """
    generates list of numeric IDs

    Parameters
        numId : number of IDs
        idSize : ID size
    """
    return [genNumID(idSize) for _ in range(numId)]


def genNameInitial():
    """
    generates name initial made of two random upper case letters
    """
    return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)


def genPhoneNum(arCode):
    """
    generates phone number, the area code followed by 7 random digits

    Parameters
        arCode : area code as string
    """
    return arCode + genNumID(7)


def selectRandomFromList(ldata):
    """
    selects an element randomly from a list

    Parameters
        ldata : list data (must not be empty, randint raises ValueError otherwise)
    """
    return ldata[randint(0, len(ldata)-1)]


def selectOtherRandomFromList(ldata, cval):
    """
    selects an element randomly from a list excluding the given one

    Parameters
        ldata : list data
        cval : value to be excluded

    Raises ValueError when the list holds no value other than cval
    """
    # without this check the retry loop below would never terminate
    if all(v == cval for v in ldata):
        raise ValueError("list has no element other than the excluded value")
    nval = selectRandomFromList(ldata)
    while nval == cval:
        nval = selectRandomFromList(ldata)
    return nval


def selectRandomSubListFromList(ldata, num):
    """
    generates random sublist from a list without replacement

    Parameters
        ldata : list data
        num : output list size, has to be smaller than the size of ldata
    """
    assertLesser(num, len(ldata), "size of sublist to be sampled greater than or equal to main list")
    if num <= 0:
        return []
    selSet = set()
    selList = []
    # rejection sampling on positions, so equal values at different positions may both be picked
    while len(selList) < num:
        i = randint(0, len(ldata)-1)
        if i not in selSet:
            selSet.add(i)
            selList.append(ldata[i])
    return selList


def selectRandomSubListFromListWithRepl(ldata, num):
    """
    generates random sublist from a list with replacement

    Parameters
        ldata : list data
        num : output list size
    """
    return [selectRandomFromList(ldata) for _ in range(num)]


def selectRandomFromDict(ddata):
    """
    selects an entry randomly from a dictionary, returned as (key, value) tuple

    Parameters
        ddata : dictionary data
    """
    dk = selectRandomFromList(list(ddata.keys()))
    return (dk, ddata[dk])
def setListRandomFromList(ldata, ldataRepl):
    """
    sets some elements in the first list, at randomly chosen distinct positions, with
    the elements from the second list. The first list is modified in place

    Parameters
        ldata : list data
        ldataRepl : list with replacement data, can not be longer than ldata

    Raises ValueError when there are more replacement values than positions
    """
    l = len(ldata)
    if len(ldataRepl) > l:
        # the search for an unused position below would never terminate
        raise ValueError("replacement list longer than list to be modified")
    selSet = set()
    for d in ldataRepl:
        i = randint(0, l-1)
        while i in selSet:
            i = randint(0, l-1)
        ldata[i] = d
        selSet.add(i)


def genIpAddress():
    """
    generates random IP v4 address in dotted notation
    """
    # randint is inclusive at both ends, so the upper bound for an octet is 255
    return ".".join(str(randint(0, 255)) for _ in range(4))


def curTimeMs():
    """
    current time as ms since epoch
    """
    return int(time.time() * 1000)


def secDegPolyFit(x1, y1, x2, y2, x3, y3):
    """
    second deg polynomial through 3 points, returns coefficients (a, b, c) of a*x*x + b*x + c

    Parameters
        x1 : 1st point x
        y1 : 1st point y
        x2 : 2nd point x
        y2 : 2nd point y
        x3 : 3rd point x
        y3 : 3rd point y

    The x values have to be distinct, otherwise ZeroDivisionError is raised
    """
    t = (y1 - y2) / (x1 - x2)
    a = (t - (y2 - y3) / (x2 - x3)) / (x1 - x3)
    b = t - a * (x1 + x2)
    c = y1 - a * x1 * x1 - b * x1
    return (a, b, c)


def range_limit(val, minv, maxv):
    """
    range limits a value

    Parameters
        val : data value
        minv : minimum
        maxv : maximum
    """
    if val < minv:
        return minv
    if val > maxv:
        return maxv
    return val


def rangeLimit(val, minv, maxv):
    """
    range limits a value, camel case alias of range_limit

    Parameters
        val : data value
        minv : minimum
        maxv : maximum
    """
    return range_limit(val, minv, maxv)


def isInRange(val, minv, maxv):
    """
    checks if within range, both boundaries included

    Parameters
        val : data value
        minv : minimum
        maxv : maximum
    """
    return minv <= val <= maxv


def stripFileLines(filePath, offset):
    """
    strips number of chars from both ends of each line and prints the result

    Parameters
        filePath : file path
        offset : offset from both ends of line
    """
    with open(filePath, "r") as fp:
        for line in fp:
            # drop only the line terminator, the last line may not have one
            line = line.rstrip("\n")
            print(line[offset:len(line) - offset])


def genLatLong(lat1, long1, lat2, long2):
    """
    generates random (lat, long) within limits

    Parameters
        lat1 : lat of 1st point
        long1 : long of 1st point
        lat2 : lat of 2nd point
        long2 : long of 2nd point
    """
    lat = lat1 + (lat2 - lat1) * random.random()
    longg = long1 + (long2 - long1) * random.random()
    return (lat, longg)
def geoDistance(lat1, long1, lat2, long2):
    """
    finds geo distance in ft between 2 points given in degrees, using the haversine formula

    Parameters
        lat1 : lat of 1st point
        long1 : long of 1st point
        lat2 : lat of 2nd point
        long2 : long of 2nd point
    """
    latDiff = math.radians(lat1 - lat2)
    longDiff = math.radians(long1 - long2)
    l1 = math.sin(latDiff/2.0)
    l2 = math.sin(longDiff/2.0)
    l3 = math.cos(math.radians(lat1))
    l4 = math.cos(math.radians(lat2))
    a = l1 * l1 + l3 * l4 * l2 * l2
    # rounding can push a marginally outside [0, 1], which would break the square roots
    a = min(1.0, max(0.0, a))
    c = 2.0 * math.atan2(math.sqrt(a), math.sqrt(1.0 - a))
    # radius in meter times ft per meter
    r = 6371008.8 * 3.280840
    return c * r


def minLimit(val, limit):
    """
    min limit, values below the limit are raised to the limit

    Parameters
        val : data value
        limit : lower limit
    """
    return limit if val < limit else val


def maxLimit(val, limit):
    """
    max limit, values above the limit are lowered to the limit

    Parameters
        val : data value
        limit : upper limit
    """
    return limit if val > limit else val


def rangeSample(val, minLim, maxLim):
    """
    if outside range, replaces value with random integer sampled within range

    Parameters
        val : value
        minLim : minimum
        maxLim : maximum
    """
    if val < minLim or val > maxLim:
        val = randint(minLim, maxLim)
    return val


def genRandomIntListWithinRange(size, minLim, maxLim):
    """
    random unique list of integers within range, both limits included

    Parameters
        size : size of returned list
        minLim : minimum
        maxLim : maximum

    Raises ValueError when the range holds fewer than size distinct integers
    """
    if size > maxLim - minLim + 1:
        raise ValueError("range too small for requested number of unique values")
    values = set()
    for _ in range(size):
        val = randint(minLim, maxLim)
        # re sample until a value not used so far is found
        while val in values:
            val = randint(minLim, maxLim)
        values.add(val)
    return list(values)


def preturbScalar(value, vrange, distr="uniform"):
    """
    preturbs a value multiplicatively within range

    Parameters
        value : data value
        vrange : value delta fraction
        distr : noise distribution type, uniform or normal
    """
    if distr == "uniform":
        scale = 1.0 - vrange + 2 * vrange * random.random()
    elif distr == "normal":
        scale = 1.0 + np.random.normal(0, vrange)
    else:
        exitWithMsg("unknown noise distr " + distr)
    return value * scale


def preturbScalarAbs(value, vrange):
    """
    preturbs a value additively within range

    Parameters
        value : data value
        vrange : value delta absolute
    """
    delta = - vrange + 2.0 * vrange * random.random()
    return value + delta


def preturbVector(values, vrange):
    """
    preturbs each element of a list multiplicatively within range

    Parameters
        values : list data
        vrange : value delta fraction
    """
    return [preturbScalar(va, vrange) for va in values]
def randomShiftVector(values, smin, smax):
    """
    shifts all elements of a list by the same random quantity sampled within a range

    Parameters
        values : list data
        smin : sampling minimum
        smax : sampling maximum
    """
    shift = np.random.uniform(smin, smax)
    return [va + shift for va in values]


def floatRange(beg, end, incr):
    """
    generates float range, end excluded

    Parameters
        beg : range begin
        end : range end
        incr : range increment
    """
    return list(np.arange(beg, end, incr))


def shuffle(values, *numShuffles):
    """
    in place shuffling with swap of random pairs

    Parameters
        values : list data
        numShuffles : parameter list for number of shuffles. When empty half the list size
            is used, one value is the number of swaps and with two values the number of
            swaps is sampled between them
    """
    size = len(values)
    if len(numShuffles) == 0:
        numShuffle = int(size / 2)
    elif len(numShuffles) == 1:
        numShuffle = numShuffles[0]
    else:
        numShuffle = randint(numShuffles[0], numShuffles[1])
    print("numShuffle {}".format(numShuffle))
    if size < 2:
        # no pair of distinct positions exists, the search for one would never end
        return
    for _ in range(numShuffle):
        first = random.randint(0, size - 1)
        second = random.randint(0, size - 1)
        while first == second:
            second = random.randint(0, size - 1)
        values[first], values[second] = values[second], values[first]


def splitList(itms, numGr):
    """
    splits a list into sub lists of approximately equal size, with items in sublists
    randomly chosen

    Parameters
        itms : list of values
        numGr : no of groups
    """
    remaining = list(itms)
    sz = int(len(remaining) / numGr)
    groups = list()
    for i in range(numGr):
        if i == numGr - 1:
            # last group takes whatever is left
            csz = len(remaining)
        else:
            # nominal size with some jitter, kept within what is actually available
            csz = sz + randint(-2, 2)
            csz = max(0, min(csz, len(remaining)))
        gr = [remaining.pop(randint(0, len(remaining) - 1)) for _ in range(csz)]
        groups.append(gr)
    return groups


def multVector(values, vrange):
    """
    multiplies all elements of a list with the same random factor within value range

    Parameters
        values : list of values
        vrange : fraction of value to be used to update
    """
    scale = 1.0 - vrange + 2 * vrange * random.random()
    return [va * scale for va in values]
def weightedAverage(values, weights):
    """
    calculates weighted average

    Parameters
        values : list of values
        weights : list of weights
    """
    assert len(values) == len(weights), "values and weights should be same size"
    return sum(v * w for v, w in zip(values, weights)) / sum(weights)


def extractFields(line, delim, keepIndices):
    """
    breaks a line into fields, keeps only specified fields and returns new line

    Parameters
        line : delemeter separated string
        delim : delemeter
        keepIndices : list of indexes to fields to be retained
    """
    items = line.split(delim)
    # pick from the split fields, not from the characters of the raw line
    return delim.join(items[i] for i in keepIndices)


def remFields(line, delim, remIndices):
    """
    removes fields from delemeter separated string and returns new line

    Parameters
        line : delemeter separated string
        delim : delemeter
        remIndices : list of indexes to fields to be removed
    """
    items = line.split(delim)
    return delim.join(it for i, it in enumerate(items) if not arrayContains(remIndices, i))


def extractList(data, indices):
    """
    extracts list from another list, given indices

    Parameters
        data : list data
        indices : list of indexes to fields to be retained
    """
    if areAllFieldsIncluded(data, indices):
        return data.copy()
    le = len(data)
    exList = list()
    for i in indices:
        assert i < le , "index {} out of bound {}".format(i, le)
        exList.append(data[i])
    return exList


def arrayContains(arr, item):
    """
    checks if array contains an item

    Parameters
        arr : list data
        item : item to search
    """
    return item in arr


def strToIntArray(line, delim=","):
    """
    int array from delemeter separated string

    Parameters
        line : delemeter separated string
        delim : delemeter
    """
    return [int(a) for a in line.split(delim)]


def strToFloatArray(line, delim=","):
    """
    float array from delemeter separated string

    Parameters
        line : delemeter separated string
        delim : delemeter
    """
    return [float(a) for a in line.split(delim)]
def strListOrRangeToIntArray(line):
    """
    int array from coma separated string or from range specified as lo:hi (hi included)

    Parameters
        line : coma separated values, range or a single value
    """
    varr = line.split(",")
    if len(varr) > 1:
        return [int(v) for v in varr]
    vrange = line.split(":")
    if len(vrange) == 2:
        return list(range(int(vrange[0]), int(vrange[1]) + 1))
    return [int(line)]


def toStr(val, precision):
    """
    converts any type to string

    Parameters
        val : value
        precision : precision for float value, with None floats get the default conversion
    """
    if precision is not None and isinstance(val, (float, np.floating)):
        return "%.*f" % (precision, val)
    return str(val)


def toStrFromList(values, precision, delim=","):
    """
    converts list of any type to delemeter separated string

    Parameters
        values : list data
        precision : precision for float value
        delim : delemeter
    """
    return delim.join(toStr(v, precision) for v in values)


def toIntList(values):
    """
    converts to int list

    Parameters
        values : list data
    """
    return [int(va) for va in values]


def toFloatList(values):
    """
    converts to float list

    Parameters
        values : list data
    """
    return [float(va) for va in values]


def toStrList(values, precision=None):
    """
    converts to string list

    Parameters
        values : list data
        precision : precision for float value
    """
    return [toStr(va, precision) for va in values]


def toIntFromBoolean(value):
    """
    converts to int, 1 for a true value and 0 otherwise

    Parameters
        value : boolean value
    """
    return 1 if value else 0


def scaleBySum(ldata):
    """
    scales so that sum is 1

    Parameters
        ldata : list data, the sum must not be 0
    """
    s = sum(ldata)
    return [e / s for e in ldata]


def scaleByMax(ldata):
    """
    scales so that max value is 1

    Parameters
        ldata : list data, the max must not be 0
    """
    m = max(ldata)
    return [e / m for e in ldata]


def typedValue(val, dtype=None):
    """
    returns typed value given string, discovers data type if not specified

    Parameters
        val : value
        dtype : data type, one of num, int, float or bool. Anything else returns val as is.
            With None the type is discovered, trying int, float, boolean and none in that order
    """
    if dtype is not None:
        if dtype == "num":
            # the value, not the type name, tells int and float apart
            sval = str(val).lower()
            dtype = "float" if ("." in sval or "e" in sval) else "int"
        if dtype == "int":
            return int(val)
        if dtype == "float":
            return float(val)
        if dtype == "bool":
            if isinstance(val, str):
                # bool() of any non empty string is True, including "false"
                return val.strip().lower() not in ("", "false")
            return bool(val)
        return val

    if not isinstance(val, str):
        return val

    try:
        return int(val)
    except ValueError:
        pass
    try:
        return float(val)
    except ValueError:
        pass

    lVal = val.lower()
    if lVal == "true":
        return True
    if lVal == "false":
        return False
    if lVal == "none":
        return None
    return val
def isInt(val):
    """
    checks if value converts to int, returns tuple of flag and the typed value
    (None when not convertible)

    Parameters
        val : value
    """
    try:
        return (True, int(val))
    except (ValueError, TypeError):
        # TypeError covers values like None that int() refuses outright
        return (False, None)


def isFloat(val):
    """
    checks if value converts to float, returns tuple of flag and the typed value
    (None when not convertible)

    Parameters
        val : value
    """
    try:
        return (True, float(val))
    except (ValueError, TypeError):
        return (False, None)


def getAllFiles(dirPath):
    """
    gets paths of all files recursively, sorted

    Parameters
        dirPath : directory path
    """
    filePaths = [os.path.join(thisDir, fileName)
        for (thisDir, subDirs, fileNames) in os.walk(dirPath)
        for fileName in fileNames]
    filePaths.sort()
    return filePaths


def getFileContent(fpath, verbose=False):
    """
    gets contents of all files under a directory, returns tuple of list of
    contents and list of file paths

    Parameters
        fpath : directory path
        verbose : verbosity flag
    """
    docComplete = []
    filePaths = getAllFiles(fpath)
    for filePath in filePaths:
        if verbose:
            print("next file " + filePath)
        with open(filePath, 'r') as contentFile:
            docComplete.append(contentFile.read())
    return (docComplete, filePaths)


def getOneFileContent(fpath):
    """
    gets content of one file as string

    Parameters
        fpath : file path
    """
    with open(fpath, 'r') as contentFile:
        return contentFile.read()


def getFileLines(dirPath, delim=","):
    """
    gets all lines from a file, each line split into fields

    Parameters
        dirPath : file path
        delim : delemeter
    """
    return list(fileRecGen(dirPath, delim))
def getFileSampleLines(dirPath, percen, delim=","):
    """
    gets randomly sampled lines from a file, each line split into fields

    Parameters
        dirPath : file path
        percen : sampling percentage
        delim : delemeter
    """
    # uniform draw from [0, 100), so percen 100 keeps every line and percen 0 keeps none
    return [li for li in fileRecGen(dirPath, delim) if random.random() * 100 < percen]


def getFileColumnAsString(dirPath, index, delim=","):
    """
    gets string column from a file

    Parameters
        dirPath : file path
        index : column index
        delim : delemeter
    """
    return [rec[index] for rec in fileRecGen(dirPath, delim)]


def getFileColumnsAsString(dirPath, indexes, delim=","):
    """
    gets multiple string columns from a file, one list per requested column

    Parameters
        dirPath : file path
        indexes : indexes of columns
        delim : delemeter
    """
    columns = [list() for _ in indexes]
    for rec in fileRecGen(dirPath, delim):
        for col, ci in zip(columns, indexes):
            col.append(rec[ci])
    return columns


def getFileColumnAsFloat(dirPath, index, delim=","):
    """
    gets float column from a file

    Parameters
        dirPath : file path
        index : column index
        delim : delemeter
    """
    return [float(v) for v in getFileColumnAsString(dirPath, index, delim)]


def getFileColumnAsInt(dirPath, index, delim=","):
    """
    gets int column from a file

    Parameters
        dirPath : file path
        index : column index
        delim : delemeter
    """
    return [int(v) for v in getFileColumnAsString(dirPath, index, delim)]


def getFileAsIntMatrix(dirPath, columns, delim=","):
    """
    extracts int matrix from csv file given column indices with each row being
    concatenation of extracted column values. row size = num of columns

    Parameters
        dirPath : file path
        columns : indexes of columns
        delim : delemeter
    """
    return [asIntList(rec) for rec in fileSelFieldsRecGen(dirPath, columns, delim)]
def getFileAsFloatMatrix(dirPath, columns, delim=","):
    """
    builds float matrix from csv file, one row per record made of the values
    of the selected columns. row size = num of columns

    Parameters
        dirPath : file path
        columns : indexes of columns
        delim : delemeter
    """
    return [asFloatList(fields) for fields in fileSelFieldsRecGen(dirPath, columns, delim)]


def getFileAsFloatColumn(dirPath):
    """
    builds float list from a file with one float per row

    Parameters
        dirPath : file path
    """
    return [float(row) for row in fileRecGen(dirPath, None)]


def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=","):
    """
    builds float matrix from csv file from the rows accepted by the row filter, each
    row made of the values of the selected columns. row size = num of columns

    Parameters
        dirPath : file path
        filt : row filter lambda
        columns : indexes of columns
        delim : delemeter
    """
    return [asFloatList(fields) for fields in fileFiltSelFieldsRecGen(dirPath, filt, columns, delim)]


def getFileAsTypedRecords(dirPath, types, delim=","):
    """
    builds typed records from csv file, every field converted as per the type
    declared for its column

    Parameters
        dirPath : file path
        types : data types
        delim : delemeter
    """
    dtypes, cvalues = extractTypesFromString(types)
    return [[__convToTyped(pos, field, dtypes) for pos, field in enumerate(fields)]
        for fields in fileRecGen(dirPath, delim)]


def getFileColsAsTypedRecords(dirPath, columns, types, delim=","):
    """
    builds typed records from csv file for the selected columns, every field
    converted as per the type declared for its original column

    Parameters
        dirPath : file path
        columns : column indexes
        types : data types
        delim : delemeter
    """
    dtypes, cvalues = extractTypesFromString(types)
    records = []
    for fields in fileSelFieldsRecGen(dirPath, columns, delim):
        # position within the extracted record maps back to the original column index
        typed = [__convToTyped(columns[pos], field, dtypes) for pos, field in enumerate(fields)]
        records.append(typed)
    return records
def getFileColumnsMinMax(dirPath, columns, dtype, delim=","):
    """
    extracts numeric matrix from csv file given column indices. For each column
    returns tuple of min, max and max - min. Returns empty list for a file with no record

    Parameters
        dirPath : file path
        columns : column indexes
        dtype : data type
        delim : delemeter
    """
    dtypes = ",".join(str(c) + ":" + dtype for c in columns)
    tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)
    if not tdata:
        return list()

    ncole = len(columns)
    assertEqual(len(tdata[0]), ncole, "actual no of columns different from expected")
    minMax = list()
    for ci in range(ncole):
        # builtin min/max work for any sign, no sentinel start values needed
        colVals = [r[ci] for r in tdata]
        vmin = min(colVals)
        vmax = max(colVals)
        minMax.append((vmin, vmax, vmax - vmin))
    return minMax


def getRecAsTypedRecord(rec, types, delim=None):
    """
    converts record to typed record

    Parameters
        rec : delemeter separated string or list of string
        types : field data types
        delim : delemeter, needed only when rec is a string
    """
    if delim is not None:
        rec = rec.split(delim)
    (dtypes, cvalues) = extractTypesFromString(types)
    return [__convToTyped(ind, value, dtypes) for ind, value in enumerate(rec)]


def __convToTyped(index, value, dtypes):
    """
    converts to typed value. Fields typed int or float are converted, everything
    else including fields with no declared type is returned unchanged

    Parameters
        index : index in type list
        value : data value
        dtypes : dictionary of data type by field index
    """
    dtype = dtypes.get(index)
    if dtype == "int":
        return int(value)
    if dtype == "float":
        return float(value)
    return value


def extractTypesFromString(types):
    """
    extracts column data types and set of values for categorical variables. Returns
    tuple of dictionary of type by column index and dictionary of value list by column index

    Parameters
        types : encoded type information, coma separated items of the form index:type
            or index:type:values with the values separated by white space
    """
    dtypes = dict()
    cvalues = dict()
    for ftype in types.split(","):
        items = ftype.split(":")
        cindex = int(items[0])
        dtypes[cindex] = items[1]
        if len(items) == 3:
            cvalues[cindex] = items[2].split()
    return (dtypes, cvalues)
def getMultipleFileAsInttMatrix(dirPathWithCol, delim=","):
    """
    extracts int matrix from csv files given column index for each file.
    num of columns = number of rows in the shortest file and num of rows = number of files

    Parameters
        dirPathWithCol : list of file path and column index pair
        delim : delemeter
    """
    mat = [getFileColumnAsInt(path, col, delim) for path, col in dirPathWithCol]
    if not mat:
        return mat
    # make all rows the same length by truncating to the shortest column
    minLen = min(len(colVals) for colVals in mat)
    return [colVals[:minLen] for colVals in mat]


def getMultipleFileAsFloatMatrix(dirPathWithCol, delim=","):
    """
    extracts float matrix from csv files given column index for each file.
    num of columns = number of rows in the shortest file and num of rows = number of files

    Parameters
        dirPathWithCol : list of file path and column index pair
        delim : delemeter
    """
    mat = [getFileColumnAsFloat(path, col, delim) for path, col in dirPathWithCol]
    if not mat:
        return mat
    # make all rows the same length by truncating to the shortest column
    minLen = min(len(colVals) for colVals in mat)
    return [colVals[:minLen] for colVals in mat]


def writeStrListToFile(ldata, filePath, delem=","):
    """
    writes list of strings or list of records to a file, one per line. A record is a list
    or tuple whose items are converted to string and joined with the delemeter

    Parameters
        ldata : list data
        filePath : file path
        delem : delemeter
    """
    with open(filePath, "w") as fh:
        for r in ldata:
            if isinstance(r, (list, tuple)):
                r = delem.join(str(it) for it in r)
            fh.write(r + "\n")


def writeFloatListToFile(ldata, prec, filePath):
    """
    writes float list to file, one value per line

    Parameters
        ldata : list data
        prec : precision
        filePath : file path
    """
    with open(filePath, "w") as fh:
        for d in ldata:
            fh.write(formatFloat(prec, d) + "\n")


def mutateFileLines(dirPath, mutator, marg, delim=","):
    """
    mutates lines from a file and returns the mutated lines

    Parameters
        dirPath : file path
        mutator : mutation callback
        marg : argument for mutation call back, when None the callback gets only the line
        delim : delemeter
    """
    lines = list()
    for li in fileRecGen(dirPath, delim):
        lines.append(mutator(li) if marg is None else mutator(li, marg))
    return lines
def takeFirst(elems):
    """
    returns first item

    Parameters
        elems : list of data
    """
    return elems[0]


def takeSecond(elems):
    """
    returns 2nd element

    Parameters
        elems : list of data
    """
    return elems[1]


def takeThird(elems):
    """
    returns 3rd element

    Parameters
        elems : list of data
    """
    return elems[2]


def addToKeyedCounter(dCounter, key, count=1):
    """
    adds to keyed counter, a missing key starts at 0

    Parameters
        dCounter : dictionary of counters
        key : dictionary key
        count : count to add
    """
    dCounter[key] = dCounter.get(key, 0) + count


def incrKeyedCounter(dCounter, key):
    """
    increments keyed counter by 1

    Parameters
        dCounter : dictionary of counters
        key : dictionary key
    """
    addToKeyedCounter(dCounter, key, 1)


def appendKeyedList(dList, key, elem):
    """
    appends to keyed list, a missing key starts with an empty list

    Parameters
        dList : dictionary of lists
        key : dictionary key
        elem : value to append
    """
    dList.setdefault(key, []).append(elem)


def isNumber(st):
    """
    returns True if string is a number i.e. digits with at most one decimal point
    and an optional leading sign

    Parameters
        st : string value
    """
    unsigned = st[1:] if st[:1] in ("+", "-") else st
    return unsigned.replace('.','',1).isdigit()


def removeNan(values):
    """
    removes nan from list

    Parameters
        values : list data
    """
    return [v for v in values if not math.isnan(v)]


def fileRecGen(filePath, delim = ","):
    """
    file record generator, yields each line without the line terminator, split
    into fields unless delemeter is None

    Parameters
        filePath : file path
        delim : delemeter
    """
    with open(filePath, "r") as fp:
        for line in fp:
            # the last line may have no terminator, only strip what is there
            if line.endswith("\n"):
                line = line[:-1]
            if delim is not None:
                line = line.split(delim)
            yield line


def fileSelFieldsRecGen(dirPath, columns, delim=","):
    """
    file record generator given column indices

    Parameters
        dirPath : file path
        columns : column indexes as int array or as string
        delim : delemeter, also used to split columns when given as string
    """
    if isinstance(columns, str):
        columns = strToIntArray(columns, delim)
    for rec in fileRecGen(dirPath, delim):
        yield extractList(rec, columns)


def fileSelFieldValueGen(dirPath, column, delim=","):
    """
    file field value generator for a given column

    Parameters
        dirPath : file path
        column : column index
        delim : delemeter
    """
    for rec in fileRecGen(dirPath, delim):
        yield rec[column]
def fileFiltRecGen(filePath, filt, delim = ","):
    """
    file record generator with row filter applied

    Parameters
        filePath : file path
        filt : row filter
        delim : delemeter
    """
    with open(filePath, "r") as fp:
        for line in fp:
            # the last line may have no terminator, only strip what is there
            if line.endswith("\n"):
                line = line[:-1]
            if delim is not None:
                line = line.split(delim)
            if filt(line):
                yield line


def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = ","):
    """
    file record generator with row and column filter applied

    Parameters
        filePath : file path
        filt : row filter
        columns : column indexes as int array or as string
        delim : delemeter, also used to split columns when given as string
    """
    if isinstance(columns, str):
        columns = strToIntArray(columns, delim)
    with open(filePath, "r") as fp:
        for line in fp:
            if line.endswith("\n"):
                line = line[:-1]
            if delim is not None:
                line = line.split(delim)
            if filt(line):
                yield extractList(line, columns)


def fileTypedRecGen(filePath, ftypes, delim = ","):
    """
    file typed record generator

    Parameters
        filePath : file path
        ftypes : flat list of alternating column index and field type (int or float)
        delim : delemeter
    """
    with open(filePath, "r") as fp:
        for line in fp:
            if line.endswith("\n"):
                line = line[:-1]
            rec = line.split(delim)
            for i in range(0, len(ftypes), 2):
                ci = ftypes[i]
                dtype = ftypes[i+1]
                assertLesser(ci, len(rec), "index out of bound")
                if dtype == "int":
                    rec[ci] = int(rec[ci])
                elif dtype == "float":
                    rec[ci] = float(rec[ci])
                else:
                    exitWithMsg("invalid data type")
            yield rec


def fileMutatedFieldsRecGen(dirPath, mutator, delim=","):
    """
    file record generator with records passed through a mutator

    Parameters
        dirPath : file path
        mutator : row field mutator
        delim : delemeter
    """
    for rec in fileRecGen(dirPath, delim):
        yield mutator(rec)


def tableSelFieldsFilter(tdata, columns):
    """
    gets tabular data for selected columns. When all columns are selected, or the
    table is empty, the table is returned as is

    Parameters
        tdata : tabular data
        columns : column indexes
    """
    if len(tdata) == 0 or areAllFieldsIncluded(tdata[0], columns):
        return tdata
    return [extractList(rec, columns) for rec in tdata]
def areAllFieldsIncluded(ldata, columns):
    """
    tells whether the column list is exactly the list of all positions of the data, in order

    Parameters
        ldata : list data
        columns : column indexes
    """
    allPositions = list(range(len(ldata)))
    return allPositions == columns


def asIntList(items):
    """
    converts every item to int and returns them as list

    Parameters
        items : list data
    """
    return list(map(int, items))


def asFloatList(items):
    """
    converts every item to float and returns them as list

    Parameters
        items : list data
    """
    return list(map(float, items))


def pastTime(interval, unit):
    """
    returns tuple of current time and the time the given interval ago, both in sec

    Parameters
        interval : time interval
        unit : time unit, d for day, h for hour and m for minute
    """
    now = int(time.time())
    if unit == "d":
        unitSec = secInDay
    elif unit == "h":
        unitSec = secInHour
    elif unit == "m":
        unitSec = secInMinute
    else:
        raise ValueError("invalid time unit " + unit)
    return (now, now - interval * unitSec)


def minuteAlign(ts):
    """
    time stamp truncated to minute boundary

    Parameters
        ts : time stamp in sec
    """
    numMinutes = int(ts / secInMinute)
    return numMinutes * secInMinute


def multMinuteAlign(ts, min):
    """
    time stamp truncated to boundary of a multiple of minutes

    Parameters
        ts : time stamp in sec
        min : number of minutes in the interval
    """
    intervalSec = secInMinute * min
    numIntervals = int(ts / intervalSec)
    return numIntervals * intervalSec


def hourAlign(ts):
    """
    time stamp truncated to hour boundary

    Parameters
        ts : time stamp in sec
    """
    numHours = int(ts / secInHour)
    return numHours * secInHour


def hourOfDayAlign(ts, hour):
    """
    time stamp for the given hour within the day of the time stamp

    Parameters
        ts : time stamp in sec
        hour : hour of day
    """
    numDays = int(ts / secInDay)
    return (24 * numDays + hour) * secInHour


def dayAlign(ts):
    """
    time stamp truncated to day boundary

    Parameters
        ts : time stamp in sec
    """
    numDays = int(ts / secInDay)
    return numDays * secInDay


def timeAlign(ts, unit):
    """
    time stamp truncated to boundary of the given unit

    Parameters
        ts : time stamp in sec
        unit : unit of time, s, m, h or d
    """
    if unit == "s":
        return ts
    if unit == "m":
        return minuteAlign(ts)
    if unit == "h":
        return hourAlign(ts)
    if unit == "d":
        return dayAlign(ts)
    raise ValueError("invalid time unit")
def monthOfYear(ts):
    """
    month of year as 0 based index, computed with fixed length years of 365 days and
    months of one twelfth of that, so only approximate with respect to the calendar

    Parameters
        ts : time stamp in sec
    """
    rem = ts % secInYear
    return int(rem / secInMonth)


def dayOfWeek(ts):
    """
    day of week as 0 based index, with 0 being the week day time stamp 0 falls on

    Parameters
        ts : time stamp in sec
    """
    rem = ts % secInWeek
    return int(rem / secInDay)


def hourOfDay(ts):
    """
    hour of day

    Parameters
        ts : time stamp in sec
    """
    rem = ts % secInDay
    return int(rem / secInHour)


def processCmdLineArgs(expectedTypes, usage):
    """
    processes command line args and returns args as typed values. Exits with status 1
    when the number of arguments is not as expected or a value can not be converted

    Parameters
        expectedTypes : expected data types of arguments
        usage : usage message string
    """
    if len(sys.argv) - 1 != len(expectedTypes):
        print ("expected number of command line arguments not found")
        print (usage)
        sys.exit(1)

    args = []
    try:
        for expType, raw in zip(expectedTypes, sys.argv[1:]):
            # an argument with an unrecognized expected type is left out of the result
            if expType == typeInt:
                args.append(int(raw))
            elif expType == typeFloat:
                args.append(float(raw))
            elif expType == typeString:
                args.append(raw)
    except ValueError:
        print ("expected number of command line arguments found but there is type mis match")
        sys.exit(1)
    return args


def mutateString(val, numMutate, ctype):
    """
    mutates string at multiple distinct random positions and returns the new string

    Parameters
        val : string value
        numMutate : num of mutations, can not be more than the string size
        ctype : type of character to mutate with, alpha, num or any

    Raises ValueError for unknown ctype or too many mutations
    """
    if ctype not in ("alpha", "num", "any"):
        raise ValueError("invalid character type " + str(ctype))
    if numMutate > len(val):
        # each mutation needs its own position, the loop below would never end otherwise
        raise ValueError("number of mutations greater than string size")

    mutations = set()
    while len(mutations) < numMutate:
        j = randint(0, len(val)-1)
        if j in mutations:
            continue
        if ctype == "alpha":
            ch = selectRandomFromList(alphaTokens)
        elif ctype == "num":
            ch = selectRandomFromList(numTokens)
        else:
            ch = selectRandomFromList(tokens)
        val = val[:j] + ch + val[j+1:]
        mutations.add(j)
    return val


def mutateList(values, numMutate, vmin, vmax, rabs=True):
    """
    mutates list in place at multiple distinct random positions and returns it

    Parameters
        values : list value
        numMutate : num of mutations, can not be more than the list size
        vmin : minimum of value range
        vmax : maximum of value range
        rabs : True if the sampled value replaces the element, otherwise it is used
            as multiplier of the element

    Raises ValueError for too many mutations
    """
    if numMutate > len(values):
        # each mutation needs its own position, the loop below would never end otherwise
        raise ValueError("number of mutations greater than list size")

    mutations = set()
    while len(mutations) < numMutate:
        j = randint(0, len(values)-1)
        if j in mutations:
            continue
        s = np.random.uniform(vmin, vmax)
        values[j] = s if rabs else values[j] * s
        mutations.add(j)
    return values
def swap(values, first, second):
    """
    exchanges the elements at two positions, in place

    Parameters
        values : list value
        first : first swap position
        second : second swap position
    """
    values[first], values[second] = values[second], values[first]


def swapBetweenLists(values1, values2):
    """
    exchanges a randomly picked element of one list with a randomly picked element
    of the other, in place

    Parameters
        values1 : first list of values
        values2 : second list of values
    """
    pos1 = randint(0, len(values1)-1)
    pos2 = randint(0, len(values2)-1)
    values1[pos1], values2[pos2] = values2[pos2], values1[pos1]


def safeAppend(values, value):
    """
    appends unless the value is None

    Parameters
        values : list value
        value : value to append
    """
    if value is None:
        return
    values.append(value)


def getAllIndex(ldata, fldata):
    """
    for each element of the first list finds its index in the second list

    Parameters
        ldata : list of values to look up
        fldata : list in which the values are searched
    """
    return [fldata.index(item) for item in ldata]


def findIntersection(lOne, lTwo):
    """
    finds the elements common to 2 lists, returned as list without duplicates

    Parameters
        lOne : first list of data
        lTwo : second list of data
    """
    common = set(lOne).intersection(set(lTwo))
    return list(common)


def isIntvOverlapped(rOne, rTwo):
    """
    tells whether 2 intervals overlap, intervals that merely touch do not overlap

    Parameters
        rOne : first interval boundaries
        rTwo : second interval boundaries
    """
    # first entirely before second
    if rOne[1] <= rTwo[0]:
        return False
    # otherwise they overlap unless first is entirely after second
    return not (rOne[0] >= rTwo[1])


def isIntvLess(rOne, rTwo):
    """
    tells whether the first interval ends at or before the start of the second

    Parameters
        rOne : first interval boundaries
        rTwo : second interval boundaries
    """
    return rOne[1] <= rTwo[0]


def findRank(e, values):
    """
    finds rank of value in a list, 1 plus the number of list elements smaller than the value

    Parameters
        e : value to compare with
        values : list data
    """
    rank = 1
    rank += sum(1 for other in values if other < e)
    return rank


def findRanks(toBeRanked, values):
    """
    finds ranks of the values of one list in another list

    Parameters
        toBeRanked : list of values for which ranks are found
        values : list in which rank is found
    """
    return [findRank(item, values) for item in toBeRanked]
values in one list in another list Parameters toBeRanked : list of values for which ranks are found values : list in which rank is found : """ return list(map(lambda e: findRank(e, values), toBeRanked)) def formatFloat(prec, value, label = None): """ formats a float with optional label Parameters prec : precision value : data value label : label for data """ st = (label + " ") if label else "" formatter = "{:." + str(prec) + "f}" return st + formatter.format(value) def formatAny(value, label = None): """ formats any obkect with optional label Parameters value : data value label : label for data """ st = (label + " ") if label else "" return st + str(value) def printList(values): """ pretty print list Parameters values : list of values """ for v in values: print(v) def printMap(values, klab, vlab, precision, offset=16): """ pretty print hash map Parameters values : dictionary of values klab : label for key vlab : label for value precision : precision offset : left justify offset """ print(klab.ljust(offset, " ") + vlab) for k in values.keys(): v = values[k] ks = toStr(k, precision).ljust(offset, " ") vs = toStr(v, precision) print(ks + vs) def printPairList(values, lab1, lab2, precision, offset=16): """ pretty print list of pairs Parameters values : dictionary of values lab1 : first label lab2 : second label precision : precision offset : left justify offset """ print(lab1.ljust(offset, " ") + lab2) for (v1, v2) in values: sv1 = toStr(v1, precision).ljust(offset, " ") sv2 = toStr(v2, precision) print(sv1 + sv2) def createMap(*values): """ create disctionary with results Parameters values : sequence of key value pairs """ result = dict() for i in range(0, len(values), 2): result[values[i]] = values[i+1] return result def getColMinMax(table, col): """ return min, max values of a column Parameters table : tabular data col : column index """ vmin = None vmax = None for rec in table: value = rec[col] if vmin is None: vmin = value vmax = value else: if value < vmin: vmin 
= value elif value > vmax: vmax = value return (vmin, vmax, vmax - vmin) def createLogger(name, logFilePath, logLevName): """ creates logger Parameters name : logger name logFilePath : log file path logLevName : log level """ logger = logging.getLogger(name) fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4) logLev = logLevName.lower() if logLev == "debug": logLevel = logging.DEBUG elif logLev == "info": logLevel = logging.INFO elif logLev == "warning": logLevel = logging.WARNING elif logLev == "error": logLevel = logging.ERROR elif logLev == "critical": logLevel = logging.CRITICAL else: raise ValueError("invalid log level name " + logLevelName) fHandler.setLevel(logLevel) fFormat = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") fHandler.setFormatter(fFormat) logger.addHandler(fHandler) logger.setLevel(logLevel) return logger @contextmanager def suppressStdout(): """ suppress stdout Parameters """ with open(os.devnull, "w") as devnull: oldStdout = sys.stdout sys.stdout = devnull try: yield finally: sys.stdout = oldStdout def exitWithMsg(msg): """ print message and exit Parameters msg : message """ print(msg + " -- quitting") sys.exit(0) def drawLine(data, yscale=None): """ line plot Parameters data : list data yscale : y axis scale """ plt.plot(data) if yscale: step = int(yscale / 10) step = int(step / 10) * 10 plt.yticks(range(0, yscale, step)) plt.show() def drawPlot(x, y, xlabel, ylabel): """ line plot Parameters x : x values y : y values xlabel : x axis label ylabel : y axis label """ if x is None: x = list(range(len(y))) plt.plot(x,y) plt.xlabel(xlabel) plt.ylabel(ylabel) plt.show() def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label): """ line plot of 2 lines Parameters x : x values y1 : first y values y2 : second y values xlabel : x labbel ylabel : y label y1label : first plot label y2label : second plot label """ plt.plot(x, y1, label = y1label) plt.plot(x, y2, label = y2label) 
plt.xlabel(xlabel) plt.ylabel(ylabel) plt.legend() plt.show() def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10): """ draw histogram Parameters ldata : list data myTitle : title myXlabel : x label myYlabel : y label nbins : num of bins """ plt.hist(ldata, bins=nbins, density=True) plt.title(myTitle) plt.xlabel(myXlabel) plt.ylabel(myYlabel) plt.show() def saveObject(obj, filePath): """ saves an object Parameters obj : object filePath : file path for saved object """ with open(filePath, "wb") as outfile: pickle.dump(obj,outfile) def restoreObject(filePath): """ restores an object Parameters filePath : file path to restore object from """ with open(filePath, "rb") as infile: obj = pickle.load(infile) return obj def isNumeric(data): """ true if all elements int or float Parameters data : numeric data list """ if type(data) == list or type(data) == np.ndarray: col = pd.Series(data) else: col = data return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64 def isInteger(data): """ true if all elements int Parameters data : numeric data list """ if type(data) == list or type(data) == np.ndarray: col = pd.Series(data) else: col = data return col.dtype == np.int32 or col.dtype == np.int64 def isFloat(data): """ true if all elements float Parameters data : numeric data list """ if type(data) == list or type(data) == np.ndarray: col = pd.Series(data) else: col = data return col.dtype == np.float32 or col.dtype == np.float64 def isBinary(data): """ true if all elements either 0 or 1 Parameters data : binary data """ re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None) return (re is None) def isCategorical(data): """ true if all elements int or string Parameters data : data value """ re = next((d for d in data if not (type(d) == int or type(d) == str)), None) return (re is None) def assertEqual(value, veq, msg): """ assert equal to Parameters value : value veq : value to be equated with msg 
: error msg """ assert value == veq , msg def assertGreater(value, vmin, msg): """ assert greater than Parameters value : value vmin : minimum value msg : error msg """ assert value > vmin , msg def assertGreaterEqual(value, vmin, msg): """ assert greater than Parameters value : value vmin : minimum value msg : error msg """ assert value >= vmin , msg def assertLesser(value, vmax, msg): """ assert less than Parameters value : value vmax : maximum value msg : error msg """ assert value < vmax , msg def assertLesserEqual(value, vmax, msg): """ assert less than Parameters value : value vmax : maximum value msg : error msg """ assert value <= vmax , msg def assertWithinRange(value, vmin, vmax, msg): """ assert within range Parameters value : value vmin : minimum value vmax : maximum value msg : error msg """ assert value >= vmin and value <= vmax, msg def assertInList(value, values, msg): """ assert contains in a list Parameters value ; balue to check for inclusion values : list data msg : error msg """ assert value in values, msg def maxListDist(l1, l2): """ maximum list element difference between 2 lists Parameters l1 : first list data l2 : second list data """ dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2)))) return dist def fileLineCount(fPath): """ number of lines ina file Parameters fPath : file path """ with open(fPath) as f: for i, li in enumerate(f): pass return (i + 1) def getAlphaNumCharCount(sdata): """ number of alphabetic and numeric charcters in a string Parameters sdata : string data """ acount = 0 ncount = 0 scount = 0 ocount = 0 assertEqual(type(sdata), str, "input must be string") for c in sdata: if c.isnumeric(): ncount += 1 elif c.isalpha(): acount += 1 elif c.isspace(): scount += 1 else: ocount += 1 r = (acount, ncount, ocount) return r def genPowerSet(cvalues, incEmpty=False): """ generates power set i.e all possible subsets Parameters cvalues : list of categorical values incEmpty : include empty set if True """ ps = list() for cv 
in cvalues: pse = list() for s in ps: sc = s.copy() sc.add(cv) #print(sc) pse.append(sc) ps.extend(pse) es = set() es.add(cv) ps.append(es) #print(es) if incEmpty: ps.append({}) return ps class StepFunction: """ step function Parameters """ def __init__(self, *values): """ initilizer Parameters values : list of tuples, wich each tuple containing 2 x values and corresponding y value """ self.points = values def find(self, x): """ finds step function value Parameters x : x value """ found = False y = 0 for p in self.points: if (x >= p[0] and x < p[1]): y = p[2] found = True break if not found: l = len(self.points) if (x < self.points[0][0]): y = self.points[0][2] elif (x > self.points[l-1][1]): y = self.points[l-1][2] return y class DummyVarGenerator: """ dummy variable generator for categorical variable """ def __init__(self, rowSize, catValues, trueVal, falseVal, delim=None): """ initilizer Parameters rowSize : row size catValues : dictionary with field index as key and list of categorical values as value trueVal : true value, typically "1" falseval : false value , typically "0" delim : field delemeter """ self.rowSize = rowSize self.catValues = catValues numCatVar = len(catValues) colCount = 0 for v in self.catValues.values(): colCount += len(v) self.newRowSize = rowSize - numCatVar + colCount #print ("new row size {}".format(self.newRowSize)) self.trueVal = trueVal self.falseVal = falseVal self.delim = delim def processRow(self, row): """ encodes categorical variables, returning as delemeter separate dstring or list Parameters row : row either delemeter separated string or list """ if self.delim is not None: rowArr = row.split(self.delim) msg = "row does not have expected number of columns found " + str(len(rowArr)) + " expected " + str(self.rowSize) assert len(rowArr) == self.rowSize, msg else: rowArr = row newRowArr = [] for i in range(len(rowArr)): curVal = rowArr[i] if (i in self.catValues): values = self.catValues[i] for val in values: if val == curVal: 
newVal = self.trueVal else: newVal = self.falseVal newRowArr.append(newVal) else: newRowArr.append(curVal) assert len(newRowArr) == self.newRowSize, "invalid new row size " + str(len(newRowArr)) + " expected " + str(self.newRowSize) encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr return encRow