#!/usr/bin/env python
# coding: utf-8

# In[1]:

import os
import sys
import random
import statistics
import numpy as np
import matplotlib.pyplot as plt
import threading
import time
import queue
sys.path.append(os.path.abspath("../lib"))
sys.path.append(os.path.abspath("../supv"))
sys.path.append(os.path.abspath("../text"))
from util import *
from sampler import *
from tnn import *
from txproc import *

emailDoms = ["yahoo.com", "gmail.com", "hotmail.com", "aol.com"]
# In[4]:

def printNgramVec(ngv):
    """
    print ngram vector
    """
    print("ngram vector")
    for i in range(len(ngv)):
        if ngv[i] > 0:
            print("{} {}".format(i, ngv[i]))
# In[5]:

def createNegMatch(tdata, ri):
    """
    create negative match by randomly selecting another record
    """
    nri = randomInt(0, len(tdata)-1)
    while nri == ri:
        nri = randomInt(0, len(tdata)-1)
    return tdata[nri]
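#Illustrative sketch, not part of the original pipeline and not called anywhere: shows how
#createNegMatch could pair each record with a randomly chosen non matching record. The tiny
#tdata list below is made up for demonstration only.
def _negMatchDemo():
    """ pair every record with a random negative match """
    tdata = [["rec" + str(i)] for i in range(5)]
    for ri in range(len(tdata)):
        print(tdata[ri], createNegMatch(tdata, ri))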
# In[6]:

def createNgramCreator():
    """ create ngram creator """
    cng = CharNGram(["lcc", "ucc", "dig"], 3, True)
    spc = ["@", "#", "_", "-", "."]
    cng.addSpChar(spc)
    cng.setWsRepl("$")
    cng.finalize()
    return cng
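#Illustrative sketch, not part of the original pipeline and not called anywhere: builds the
#character n gram creator and prints the count vector for one sample field value. The sample
#email address is made up; toMgramCount is used exactly as in getSim below.
def _ngramDemo():
    """ n gram count vector for a sample field """
    cng = createNgramCreator()
    ngv = cng.toMgramCount("john.doe@gmail.com")
    printNgramVec(ngv)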
# In[7]:

def getSim(rec, incOutput=True):
    """ get rec pair similarity """
    #print(rec)
    sim = list()
    for i in range(6):
        #print("field " + str(i))
        if i == 3:
            s = levenshteinSimilarity(rec[i], rec[i+6])
        else:
            ngv1 = cng.toMgramCount(rec[i])
            ngv2 = cng.toMgramCount(rec[i+6])
            #printNgramVec(ngv1)
            #printNgramVec(ngv2)
            s = cosineSimilarity(ngv1, ngv2)
        sim.append(s)
    ss = toStrFromList(sim, 6)
    srec = ss + "," + rec[-1] if incOutput else ss
    return srec
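#Illustrative sketch, not part of the original pipeline and not called anywhere: builds one
#hypothetical record pair (6 fields of a new record followed by the corresponding 6 fields of
#an existing record, all values made up) and prints its 6 element similarity vector. getSim
#reads the module level cng, which is why it is assigned here before the call.
def _simDemo():
    """ similarity vector for one hypothetical record pair """
    global cng
    cng = createNgramCreator()
    rec = ["john", "doe", "212-555-0114", "10 main st", "new york", "john.doe@gmail.com",
        "jon", "doe", "212-555-0114", "10 main street", "new york", "jon.doe@gmail.com"]
    print(getSim(rec, incOutput=False))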
# In[8]:

class SimThread (threading.Thread):
    """ multi threaded similarity calculation """

    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
        """ initialize """
        threading.Thread.__init__(self)
        self.tName = tName
        self.cng = cng
        self.qu = qu
        self.incOutput = incOutput
        self.outQu = outQu
        self.outQuSize = outQuSize

    def run(self):
        """ execution """
        while not exitFlag:
            rec = dequeue(self.qu, workQuLock)
            if rec is not None:
                srec = getSim(rec, self.incOutput)
                if self.outQu is None:
                    print(srec)
                else:
                    enqueue(srec, self.outQu, outQuLock, self.outQuSize)
def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
    """ create worker threads """
    threadList = list(map(lambda i: "Thread-" + str(i+1), range(nworker)))
    threads = list()
    for tName in threadList:
        thread = SimThread(tName, cng, workQu, incOutput, outQu, outQuSize)
        thread.start()
        threads.append(thread)
    return threads
def enqueue(rec, qu, quLock, qSize):
    """ enqueue record, waiting while the queue is full """
    queued = False
    while not queued:
        quLock.acquire()
        if qu.qsize() < qSize - 1:
            qu.put(rec)
            queued = True
        quLock.release()
        if not queued:
            #queue full, back off before retrying
            time.sleep(1)
def dequeue(qu, quLock):
    """ dequeue record """
    rec = None
    quLock.acquire()
    if not qu.empty():
        rec = qu.get()
    quLock.release()
    return rec
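#Illustrative sketch, not part of the original pipeline and not called anywhere: shows the
#bounded hand off implemented by enqueue and dequeue on a single thread. The queue size and
#sample records are made up; in the script itself the queues are shared with SimThread workers.
def _queueDemo():
    """ enqueue a few records and drain them back """
    qu = queue.Queue(4)
    lock = threading.Lock()
    for rec in ["a", "b", "c"]:
        enqueue(rec, qu, lock, 4)
    while True:
        rec = dequeue(qu, lock)
        if rec is None:
            break
        print(rec)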
if __name__ == "__main__":
    #multi threading related
    workQuLock = threading.Lock()
    outQuLock = threading.Lock()
    exitFlag = False

    """ predict with neural network model """
    newFilePath = sys.argv[1]
    existFilePath = sys.argv[2]
    nworker = int(sys.argv[3])
    prFile = sys.argv[4]

    regr = FeedForwardNetwork(prFile)
    regr.buildModel()
    cng = createNgramCreator()

    #create worker threads
    qSize = 100
    workQu = queue.Queue(qSize)
    outQu = queue.Queue(qSize)
    threads = createThreads(nworker, cng, workQu, False, outQu, qSize)

    for nrec in fileRecGen(newFilePath):
        srecs = list()
        ecount = 0
        y_pred = []
        #print("processing ", nrec)
        for erec in fileRecGen(existFilePath):
            #pair the new record with each existing record
            rec = nrec.copy()
            rec.extend(erec)
            #print(rec)
            enqueue(rec, workQu, workQuLock, qSize)
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))
            ecount += 1

        #wait until work queue is drained
        while not workQu.empty():
            pass

        #drain output queue
        while len(srecs) < ecount:
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))

        #predict similarity for all pairs and keep the best match score
        sims = FeedForwardNetwork.predict(regr, srecs)
        sims = sims.reshape(sims.shape[0])
        y_pred.append(max(sims))
        #print("{} {:.3f}".format(nrec, y_pred))
        print(nrec, max(y_pred))

    #signal worker threads to exit and wait for them
    exitFlag = True
    for thread in threads:
        thread.join()
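#Example invocation (the script and file names below are hypothetical):
#
#   python simthr.py new_customers.txt existing_customers.txt 4 tnn_simil.properties
#
#arguments: file with new records, file with existing records, number of worker threads,
#and the neural network configuration (properties) file passed to FeedForwardNetwork.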