#!/usr/bin/env python
# coding: utf-8

# In[1]:

import os
import sys
import random
import statistics
import numpy as np
import matplotlib.pyplot as plt
import threading
import time
import queue
sys.path.append(os.path.abspath("../lib"))
sys.path.append(os.path.abspath("../supv"))
sys.path.append(os.path.abspath("../text"))
from util import *
from sampler import *
from tnn import *
from txproc import *

emailDoms = ["yahoo.com", "gmail.com", "hotmail.com", "aol.com"]
# In[4]:

def printNgramVec(ngv):
    """
    print ngram vector
    """
    print("ngram vector")
    for i in range(len(ngv)):
        if ngv[i] > 0:
            print("{} {}".format(i, ngv[i]))
# In[5]:

def createNegMatch(tdata, ri):
    """
    create negative match by randomly selecting another record
    """
    nri = randomInt(0, len(tdata)-1)
    while nri == ri:
        nri = randomInt(0, len(tdata)-1)
    return tdata[nri]
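#Illustrative sketch, not part of the original pipeline and not called anywhere: shows how
#createNegMatch could pair each record with a randomly chosen non matching record. The tiny
#tdata list below is made up for demonstration only.
def _negMatchDemo():
    """ pair every record with a random negative match """
    tdata = [["rec" + str(i)] for i in range(5)]
    for ri in range(len(tdata)):
        print(tdata[ri], createNegMatch(tdata, ri))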
# In[6]:

def createNgramCreator():
    """ create ngram creator """
    cng = CharNGram(["lcc", "ucc", "dig"], 3, True)
    spc = ["@", "#", "_", "-", "."]
    cng.addSpChar(spc)
    cng.setWsRepl("$")
    cng.finalize()
    return cng
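#Illustrative sketch, not part of the original pipeline and not called anywhere: builds the
#character n gram creator and prints the count vector for one sample field value. The sample
#email address is made up; toMgramCount is used exactly as in getSim below.
def _ngramDemo():
    """ n gram count vector for a sample field """
    cng = createNgramCreator()
    ngv = cng.toMgramCount("john.doe@gmail.com")
    printNgramVec(ngv)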
# In[7]:

def getSim(rec, incOutput=True):
    """ get rec pair similarity """
    #print(rec)
    sim = list()
    for i in range(6):
        #print("field " + str(i))
        if i == 3:
            s = levenshteinSimilarity(rec[i], rec[i+6])
        else:
            ngv1 = cng.toMgramCount(rec[i])
            ngv2 = cng.toMgramCount(rec[i+6])
            #printNgramVec(ngv1)
            #printNgramVec(ngv2)
            s = cosineSimilarity(ngv1, ngv2)
        sim.append(s)
    ss = toStrFromList(sim, 6)
    srec = ss + "," + rec[-1] if incOutput else ss
    return srec
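#Illustrative sketch, not part of the original pipeline and not called anywhere: builds one
#hypothetical record pair (6 fields of a new record followed by the corresponding 6 fields of
#an existing record, all values made up) and prints its 6 element similarity vector. getSim
#reads the module level cng, which is why it is assigned here before the call.
def _simDemo():
    """ similarity vector for one hypothetical record pair """
    global cng
    cng = createNgramCreator()
    rec = ["john", "doe", "212-555-0114", "10 main st", "new york", "john.doe@gmail.com",
        "jon", "doe", "212-555-0114", "10 main street", "new york", "jon.doe@gmail.com"]
    print(getSim(rec, incOutput=False))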
# In[8]:

class SimThread (threading.Thread):
    """ multi threaded similarity calculation """

    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
        """ initialize """
        threading.Thread.__init__(self)
        self.tName = tName
        self.cng = cng
        self.qu = qu
        self.incOutput = incOutput
        self.outQu = outQu
        self.outQuSize = outQuSize

    def run(self):
        """ execution """
        while not exitFlag:
            rec = dequeue(self.qu, workQuLock)
            if rec is not None:
                srec = getSim(rec, self.incOutput)
                if self.outQu is None:
                    print(srec)
                else:
                    enqueue(srec, self.outQu, outQuLock, self.outQuSize)
def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
    """ create worker threads """
    threadList = list(map(lambda i: "Thread-" + str(i+1), range(nworker)))
    threads = list()
    for tName in threadList:
        thread = SimThread(tName, cng, workQu, incOutput, outQu, outQuSize)
        thread.start()
        threads.append(thread)
    return threads
def enqueue(rec, qu, quLock, qSize):
    """ enqueue record, waiting while the queue is full """
    queued = False
    while not queued:
        quLock.acquire()
        if qu.qsize() < qSize - 1:
            qu.put(rec)
            queued = True
        quLock.release()
        if not queued:
            #queue full, back off before retrying
            time.sleep(1)
def dequeue(qu, quLock):
    """ dequeue record """
    rec = None
    quLock.acquire()
    if not qu.empty():
        rec = qu.get()
    quLock.release()
    return rec
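#Illustrative sketch, not part of the original pipeline and not called anywhere: shows the
#bounded hand off implemented by enqueue and dequeue on a single thread. The queue size and
#sample records are made up; in the script itself the queues are shared with SimThread workers.
def _queueDemo():
    """ enqueue a few records and drain them back """
    qu = queue.Queue(4)
    lock = threading.Lock()
    for rec in ["a", "b", "c"]:
        enqueue(rec, qu, lock, 4)
    while True:
        rec = dequeue(qu, lock)
        if rec is None:
            break
        print(rec)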
if __name__ == "__main__":
    #multi threading related
    workQuLock = threading.Lock()
    outQuLock = threading.Lock()
    exitFlag = False

    """ predict with neural network model """
    newFilePath = sys.argv[1]
    existFilePath = sys.argv[2]
    nworker = int(sys.argv[3])
    prFile = sys.argv[4]

    regr = FeedForwardNetwork(prFile)
    regr.buildModel()
    cng = createNgramCreator()

    #create worker threads
    qSize = 100
    workQu = queue.Queue(qSize)
    outQu = queue.Queue(qSize)
    threads = createThreads(nworker, cng, workQu, False, outQu, qSize)

    for nrec in fileRecGen(newFilePath):
        srecs = list()
        ecount = 0
        y_pred = []
        #print("processing ", nrec)
        for erec in fileRecGen(existFilePath):
            #pair the new record with each existing record
            rec = nrec.copy()
            rec.extend(erec)
            #print(rec)
            enqueue(rec, workQu, workQuLock, qSize)
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))
            ecount += 1

        #wait until work queue is drained
        while not workQu.empty():
            pass

        #drain output queue
        while len(srecs) < ecount:
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))

        #predict similarity for all pairs and keep the best match score
        sims = FeedForwardNetwork.predict(regr, srecs)
        sims = sims.reshape(sims.shape[0])
        y_pred.append(max(sims))
        #print("{} {:.3f}".format(nrec, y_pred))
        print(nrec, max(y_pred))

    #signal worker threads to exit and wait for them
    exitFlag = True
    for thread in threads:
        thread.join()
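#Example invocation (the script and file names below are hypothetical):
#
#   python simthr.py new_customers.txt existing_customers.txt 4 tnn_simil.properties
#
#arguments: file with new records, file with existing records, number of worker threads,
#and the neural network configuration (properties) file passed to FeedForwardNetwork.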