# v-doc_abstractive_mac / preprocess.py
import time
import os
import random
import json
import pickle
import numpy as np
from tqdm import tqdm
from termcolor import colored
from program_translator import ProgramTranslator
from config import config
# Print bold text
def bold(txt):
return colored(str(txt), attrs=["bold"])
# Print bold and colored text
def bcolored(txt, color):
return colored(str(txt), color, attrs=["bold"])
# Write a line to file
def writeline(f, line):
f.write(str(line) + "\n")
# Write a list to file
def writelist(f, l):
writeline(f, ",".join(map(str, l)))
# 2d list to numpy
def vectorize2DList(items, minX=0, minY=0, dtype=int):  # np.int was removed in NumPy 1.24+; the builtin int is equivalent
maxX = max(len(items), minX)
maxY = max([len(item) for item in items] + [minY])
t = np.zeros((maxX, maxY), dtype=dtype)
    tLengths = np.zeros((maxX,), dtype=int)
for i, item in enumerate(items):
t[i, 0:len(item)] = np.array(item, dtype=dtype)
tLengths[i] = len(item)
return t, tLengths
# 3d list to numpy
def vectorize3DList(items, minX=0, minY=0, minZ=0, dtype=int):
maxX = max(len(items), minX)
maxY = max([len(item) for item in items] + [minY])
maxZ = max([len(subitem) for item in items for subitem in item] + [minZ])
t = np.zeros((maxX, maxY, maxZ), dtype=dtype)
    tLengths = np.zeros((maxX, maxY), dtype=int)
for i, item in enumerate(items):
for j, subitem in enumerate(item):
t[i, j, 0:len(subitem)] = np.array(subitem, dtype=dtype)
tLengths[i, j] = len(subitem)
return t, tLengths
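# A minimal usage sketch of the padding helpers above (illustrative only, not part of the
# original pipeline): ragged integer lists are right-padded with zeros and their true
# lengths are returned alongside the padded array.
def _demoVectorize2DList():
    seqs = [[2, 5, 7], [4, 9], [3]]  # hypothetical encoded sequences
    padded, lengths = vectorize2DList(seqs)
    # padded  -> [[2 5 7]
    #             [4 9 0]
    #             [3 0 0]]   (shape (3, 3))
    # lengths -> [3 2 1]
    return padded, lengths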
'''
Encodes text into integers. Keeps a dictionary between string words (symbols)
and their matching integers. Supports encoding and decoding.
'''
class SymbolDict(object):
def __init__(self, empty=False):
self.padding = "<PAD>"
self.unknown = "<UNK>"
self.start = "<START>"
self.end = "<END>"
self.invalidSymbols = [self.padding, self.unknown, self.start, self.end]
if empty:
self.sym2id = {}
self.id2sym = []
else:
self.sym2id = {self.padding: 0, self.unknown: 1, self.start: 2, self.end: 3}
self.id2sym = [self.padding, self.unknown, self.start, self.end]
self.allSeqs = []
def getNumSymbols(self):
return len(self.sym2id)
def isPadding(self, enc):
return enc == 0
def isUnknown(self, enc):
return enc == 1
def isStart(self, enc):
return enc == 2
def isEnd(self, enc):
return enc == 3
def isValid(self, enc):
return enc < self.getNumSymbols() and enc >= len(self.invalidSymbols)
def resetSeqs(self):
self.allSeqs = []
def addSeq(self, seq):
self.allSeqs += seq
    # Call to create the words-to-integers vocabulary (after reading word sequences with addSeq).
def createVocab(self, minCount=0):
counter = {}
for symbol in self.allSeqs:
counter[symbol] = counter.get(symbol, 0) + 1
for symbol in counter:
if counter[symbol] > minCount and (symbol not in self.sym2id):
self.sym2id[symbol] = self.getNumSymbols()
self.id2sym.append(symbol)
# Encodes a symbol. Returns the matching integer.
def encodeSym(self, symbol):
if symbol not in self.sym2id:
symbol = self.unknown
return self.sym2id[symbol]
    '''
    Encodes a sequence of symbols.
    Optionally adds start / end symbols.
    Optionally reverses the sequence.
    '''
def encodeSequence(self, decoded, addStart=False, addEnd=False, reverse=False):
if reverse:
decoded.reverse()
if addStart:
decoded = [self.start] + decoded
if addEnd:
decoded = decoded + [self.end]
encoded = [self.encodeSym(symbol) for symbol in decoded]
return encoded
# Decodes an integer into its symbol
def decodeId(self, enc):
return self.id2sym[enc] if enc < self.getNumSymbols() else self.unknown
    '''
    Decodes a sequence of integers into their symbols.
    If delim is given, joins the symbols using delim.
    Optionally reverses the resulting sequence.
    '''
def decodeSequence(self, encoded, delim=None, reverse=False, stopAtInvalid=True):
length = 0
for i in range(len(encoded)):
if not self.isValid(encoded[i]) and stopAtInvalid:
break
length += 1
encoded = encoded[:length]
decoded = [self.decodeId(enc) for enc in encoded]
if reverse:
decoded.reverse()
if delim is not None:
return delim.join(decoded)
return decoded
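# A small round-trip sketch of SymbolDict (illustrative; the words used are hypothetical):
# build a vocabulary from observed word sequences, then encode and decode a sentence.
def _demoSymbolDict():
    d = SymbolDict()
    d.addSeq(["is", "there", "a", "red", "cube"])
    d.createVocab()  # every word seen more than minCount=0 times enters the vocabulary
    enc = d.encodeSequence(["is", "there", "a", "blue", "cube"])
    # "blue" was never seen, so it maps to the <UNK> id (1); the rest map to their own ids
    dec = d.decodeSequence(enc, delim=" ", stopAtInvalid=False)
    # dec == "is there a <UNK> cube"
    return enc, dec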
'''
Preprocesses a given dataset into numpy arrays.
By calling preprocess, the class:
1. Reads the input data files into a dictionary.
2. Saves the resulting jsons to files and loads them instead of parsing the input if the files already exist.
3. Initializes word embeddings to random / GloVe.
4. Optionally filters data according to given filters.
5. Encodes and vectorizes the data into numpy arrays.
6. Buckets the data according to the instances' lengths.
'''
class Preprocesser(object):
def __init__(self):
self.questionDict = SymbolDict()
self.answerDict = SymbolDict(empty=True)
self.qaDict = SymbolDict()
self.specificDatasetDicts = None
self.programDict = SymbolDict()
self.programTranslator = ProgramTranslator(self.programDict, 2)
    '''
    Tokenizes a string into a list of symbols.
    Args:
        text: raw string to tokenize.
        ignoredPuncts: punctuation to ignore (removed from the text).
        keptPuncts: punctuation to keep (as separate symbols).
        endPunct: punctuation to remove if it appears at the end of the text.
        delim: delimiter between symbols.
        clean: True to apply the replacement lists to the text.
        replacelistPre: dictionary of replacements to perform on the text before tokenization.
        replacelistPost: dictionary of replacements to perform on the tokens after tokenization.
    '''
# sentence tokenizer
allPunct = ["?", "!", "\\", "/", ")", "(", ".", ",", ";", ":"]
def tokenize(self, text, ignoredPuncts=["?", "!", "\\", "/", ")", "("],
keptPuncts=[".", ",", ";", ":"], endPunct=[">", "<", ":"], delim=" ",
clean=False, replacelistPre=dict(), replacelistPost=dict()):
if clean:
for word in replacelistPre:
origText = text
text = text.replace(word, replacelistPre[word])
if (origText != text):
print(origText)
print(text)
print("")
for punct in endPunct:
if text[-1] == punct:
print(text)
text = text[:-1]
print(text)
print("")
for punct in keptPuncts:
text = text.replace(punct, delim + punct + delim)
for punct in ignoredPuncts:
text = text.replace(punct, "")
ret = text.lower().split(delim)
if clean:
origRet = ret
ret = [replacelistPost.get(word, word) for word in ret]
if origRet != ret:
print(origRet)
print(ret)
ret = [t for t in ret if t != ""]
return ret
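    # For illustration (hypothetical input, not from the original code), with the defaults above:
    #   tokenize("Is there a red cube, or a sphere?")
    # keeps the comma as its own token, drops the "?", lower-cases the text, and yields
    #   ["is", "there", "a", "red", "cube", ",", "or", "a", "sphere"]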
# Read class' generated files.
# files interface
def readFiles(self, instancesFilename):
with open(instancesFilename, "r") as inFile:
instances = json.load(inFile)
with open(config.questionDictFile(), "rb") as inFile:
self.questionDict = pickle.load(inFile)
with open(config.answerDictFile(), "rb") as inFile:
self.answerDict = pickle.load(inFile)
with open(config.qaDictFile(), "rb") as inFile:
self.qaDict = pickle.load(inFile)
return instances
'''
Generate class' files. Save json representation of instances and
symbols-to-integers dictionaries.
'''
def writeFiles(self, instances, instancesFilename):
with open(instancesFilename, "w") as outFile:
json.dump(instances, outFile)
with open(config.questionDictFile(), "wb") as outFile:
pickle.dump(self.questionDict, outFile)
with open(config.answerDictFile(), "wb") as outFile:
pickle.dump(self.answerDict, outFile)
with open(config.qaDictFile(), "wb") as outFile:
pickle.dump(self.qaDict, outFile)
# Write prediction json to file and optionally a one-answer-per-line output file
def writePreds(self, res, tier, suffix=""):
if res is None:
return
preds = res["preds"]
sortedPreds = sorted(preds, key=lambda instance: instance["index"])
with open(config.predsFile(tier + suffix), "w") as outFile:
outFile.write(json.dumps(sortedPreds))
with open(config.answersFile(tier + suffix), "w") as outFile:
for instance in sortedPreds:
writeline(outFile, instance["prediction"])
    # Reads previously generated instances from file (if it exists).
    # datasetFilename and train are unused here but keep the signature consistent with the
    # readData dispatch below, which passes all three arguments.
    def readPDF(self, datasetFilename, instancesFilename, train=False):
instances = []
if os.path.exists(instancesFilename):
instances = self.readFiles(instancesFilename)
return instances
def readData(self, datasetFilename, instancesFilename, train):
# data extraction
datasetReader = {
"PDF": self.readPDF
}
return datasetReader[config.dataset](datasetFilename, instancesFilename, train)
def vectorizeData(self, data):
# if "SHARED" tie symbol representations in questions and answers
if config.ansEmbMod == "SHARED":
qDict = self.qaDict
else:
qDict = self.questionDict
encodedQuestion = [qDict.encodeSequence(d["questionSeq"]) for d in data]
question, questionL = vectorize2DList(encodedQuestion)
        # pass the whole instances? if they are heavy, this is not a good idea
imageId = [d["imageId"] for d in data]
instance = data
return {"question": question,
"questionLength": questionL,
"imageId": imageId
}
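    # For reference: for a batch of N instances whose longest question has M tokens,
    #   "question"       -> int array of shape (N, M), zero-padded on the right
    #   "questionLength" -> int array of shape (N,) holding the true lengths
    #   "imageId"        -> plain Python list of the N image ids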
# Separates data based on a field length
def lseparator(self, key, lims):
maxI = len(lims)
def separatorFn(x):
v = x[key]
for i, lim in enumerate(lims):
if len(v) < lim:
return i
return maxI
return {"separate": separatorFn, "groupsNum": maxI + 1}
# Buckets data to groups using a separator
def bucket(self, instances, separator):
buckets = [[] for i in range(separator["groupsNum"])]
for instance in instances:
bucketI = separator["separate"](instance)
buckets[bucketI].append(instance)
return [bucket for bucket in buckets if len(bucket) > 0]
    # Re-buckets a bucket list given a separator
def rebucket(self, buckets, separator):
res = []
for bucket in buckets:
res += self.bucket(bucket, separator)
return res
# Buckets data based on question / program length
def bucketData(self, data, noBucket=False):
if noBucket:
buckets = [data]
else:
if config.noBucket:
buckets = [data]
elif config.noRebucket:
questionSep = self.lseparator("questionSeq", config.questionLims)
buckets = self.bucket(data, questionSep)
else:
programSep = self.lseparator("programSeq", config.programLims)
questionSep = self.lseparator("questionSeq", config.questionLims)
buckets = self.bucket(data, programSep)
buckets = self.rebucket(buckets, questionSep)
return buckets
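    # For illustration (hypothetical limits, not taken from config): with questionLims = [5, 10],
    # lseparator("questionSeq", [5, 10]) assigns an instance to group 0 if its question has fewer
    # than 5 tokens, group 1 if fewer than 10, and group 2 otherwise; bucket() then collects the
    # instances per group and drops empty groups.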
    '''
    Prepares data:
    1. Filters the data according to the tier-specific filters below.
    2. Takes only a subset of the data based on config.trainedNum / config.testedNum.
    3. Buckets the data according to question / program length.
    4. Vectorizes the data into numpy arrays.
    '''
def prepareData(self, data, train, filterKey=None, noBucket=False):
filterDefault = {"maxQLength": 0, "maxPLength": 0, "onlyChain": False, "filterOp": 0}
filterTrain = {"maxQLength": config.tMaxQ, "maxPLength": config.tMaxP,
"onlyChain": config.tOnlyChain, "filterOp": config.tFilterOp}
filterVal = {"maxQLength": config.vMaxQ, "maxPLength": config.vMaxP,
"onlyChain": config.vOnlyChain, "filterOp": config.vFilterOp}
filters = {"train": filterTrain, "evalTrain": filterTrain,
"val": filterVal, "test": filterDefault}
if filterKey is None:
fltr = filterDefault
else:
fltr = filters[filterKey]
# split data when finetuning on validation set
if config.trainExtra and config.extraVal and (config.finetuneNum > 0):
if train:
data = data[:config.finetuneNum]
else:
data = data[config.finetuneNum:]
typeFilter = config.typeFilters[fltr["filterOp"]]
# filter specific settings
if fltr["onlyChain"]:
data = [d for d in data if all((len(inputNum) < 2) for inputNum in d["programInputs"])]
if fltr["maxQLength"] > 0:
data = [d for d in data if len(d["questionSeq"]) <= fltr["maxQLength"]]
if fltr["maxPLength"] > 0:
data = [d for d in data if len(d["programSeq"]) <= fltr["maxPLength"]]
if len(typeFilter) > 0:
data = [d for d in data if d["programSeq"][-1] not in typeFilter]
# run on subset of the data. If 0 then use all data
num = config.trainedNum if train else config.testedNum
        # retainVal = True to retain the same validation sample across runs
if (not train) and (not config.retainVal):
random.shuffle(data)
if num > 0:
data = data[:num]
# set number to match dataset size
if train:
config.trainedNum = len(data)
else:
config.testedNum = len(data)
# bucket
buckets = self.bucketData(data, noBucket=noBucket)
# vectorize
return [self.vectorizeData(bucket) for bucket in buckets]
# Prepares all the tiers of a dataset. See prepareData method for further details.
def prepareDataset(self, dataset, noBucket=False):
if dataset is None:
return None
for tier in dataset:
if dataset[tier] is not None:
dataset[tier]["data"] = self.prepareData(dataset[tier]["instances"],
train=dataset[tier]["train"], filterKey=tier,
noBucket=noBucket)
for tier in dataset:
if dataset[tier] is not None:
del dataset[tier]["instances"]
return dataset
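    # For reference, prepareDataset expects a structure like the following (inferred from the
    # accesses above; tier names match the filter keys in prepareData):
    #   {"train": {"instances": [...], "train": True},
    #    "val":   {"instances": [...], "train": False}}
    # and replaces each tier's "instances" with the vectorized, bucketed "data".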
# Initializes word embeddings to random uniform / random normal / GloVe.
def initializeWordEmbeddings(self, wordsDict=None, noPadding=False):
# default dictionary to use for embeddings
if wordsDict is None:
wordsDict = self.questionDict
# uniform initialization
if config.wrdEmbUniform:
lowInit = -1.0 * config.wrdEmbScale
highInit = 1.0 * config.wrdEmbScale
embeddings = np.random.uniform(low=lowInit, high=highInit,
size=(wordsDict.getNumSymbols(), config.wrdEmbDim))
# normal initialization
else:
embeddings = config.wrdEmbScale * np.random.randn(wordsDict.getNumSymbols(),
config.wrdEmbDim)
        # if wrdEmbRandom == False, load GloVe vectors (config.wordVectorsFile is expected to be
        # in the standard GloVe text format: one "<word> <v1> <v2> ... <vD>" entry per line)
counter = 0
if (not config.wrdEmbRandom):
with open(config.wordVectorsFile, 'r') as inFile:
for line in inFile:
line = line.strip().split()
word = line[0].lower()
vector = [float(x) for x in line[1:]]
index = wordsDict.sym2id.get(word)
if index is not None:
embeddings[index] = vector
counter += 1
print(counter)
print(self.questionDict.sym2id)
print(len(self.questionDict.sym2id))
print(self.answerDict.sym2id)
print(len(self.answerDict.sym2id))
print(self.qaDict.sym2id)
print(len(self.qaDict.sym2id))
        if noPadding:
            return embeddings  # the dictionary was built without a padding symbol, keep all rows
        else:
            return embeddings[1:]  # drop the embedding of the padding symbol (index 0)
'''
Initializes words embeddings for question words and optionally for answer words
(when config.ansEmbMod == "BOTH"). If config.ansEmbMod == "SHARED", tie embeddings for
question and answer same symbols.
'''
def initializeQAEmbeddings(self):
# use same embeddings for questions and answers
if config.ansEmbMod == "SHARED":
qaEmbeddings = self.initializeWordEmbeddings(self.qaDict)
ansMap = np.array([self.qaDict.sym2id[sym] for sym in self.answerDict.id2sym])
embeddings = {"qa": qaEmbeddings, "ansMap": ansMap}
# use different embeddings for questions and answers
else:
qEmbeddings = self.initializeWordEmbeddings(self.questionDict)
aEmbeddings = None
if config.ansEmbMod == "BOTH":
aEmbeddings = self.initializeWordEmbeddings(self.answerDict, noPadding=True)
embeddings = {"q": qEmbeddings, "a": aEmbeddings}
return embeddings
    '''
    Preprocesses a given dataset into numpy arrays:
    1. Reads the input data files into a dictionary.
    2. Saves the resulting jsons to files and loads them instead of parsing the input if the files already exist.
    3. Initializes word embeddings to random / GloVe.
    4. Optionally filters data according to given filters.
    5. Encodes and vectorizes the data into numpy arrays.
    6. Buckets the data according to the instances' lengths.
    '''
def preprocessData(self, question, debug=False):
# Read data into json and symbols' dictionaries
print(bold("Loading data..."))
start = time.time()
with open(config.questionDictFile(), "rb") as inFile:
self.questionDict = pickle.load(inFile)
with open(config.qaDictFile(), "rb") as inFile:
self.qaDict = pickle.load(inFile)
with open(config.answerDictFile(), "rb") as inFile:
self.answerDict = pickle.load(inFile)
        # strip '?' and ',' so punctuation does not glue neighboring words together
        question = question.replace('?', '').replace(',', '').lower().split()
encodedQuestion = self.questionDict.encodeSequence(question)
data = {'question': np.array([encodedQuestion]), 'questionLength': np.array([len(encodedQuestion)])}
print("took {:.2f} seconds".format(time.time() - start))
# Initialize word embeddings (random / glove)
print(bold("Loading word vectors..."))
start = time.time()
embeddings = self.initializeQAEmbeddings()
print("took {:.2f} seconds".format(time.time() - start))
answer = 'yes' # DUMMY_ANSWER
self.answerDict.addSeq([answer])
self.qaDict.addSeq([answer])
config.questionWordsNum = self.questionDict.getNumSymbols()
config.answerWordsNum = self.answerDict.getNumSymbols()
return data, embeddings, self.answerDict
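# A minimal sketch of running this module directly (assumes the repo's config and
# program_translator modules import cleanly, as already required at the top of this file;
# preprocessData itself additionally needs the pickled dictionaries and the word-vector
# file referenced by config, so only tokenization is exercised here).
if __name__ == "__main__":
    preprocessor = Preprocesser()
    sample = "Is there a red cube, or a sphere?"  # hypothetical question
    print(preprocessor.tokenize(sample))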