orion / src /utils.py
andreslu's picture
Upload utils.py
4e06688
from ngram import NGram
def post_process_template(tB):
    """Return ``tB`` guaranteed to end with a period.

    A string that already ends with '.' is returned unchanged; otherwise a
    single '.' is appended.
    """
    # Earlier alternative, kept for reference (truncate at the first period):
    # return tB.split('.')[0] + '.'
    if tB.endswith('.'):
        return tB
    return tB + '.'
def construct_template(words, templateA, if_then=False):
    """Build the masked template(s) for a sequence of words.

    With two or more words the result is a single template in which the
    words are separated by ``<mask>`` slots and terminated by a period.
    With exactly one word the template is the word followed by one
    ``<mask>`` slot. With no words the result is an empty list.

    When ``if_then`` is true, the ``<mask>`` slots of ``templateA`` are
    filled left to right with ``words`` and every template is rewritten as
    ``'If ' + filled templateA + ' then ' + template``.

    Raises:
        ValueError: (from ``str.index``) when ``if_then`` is true and
            ``templateA`` contains fewer ``<mask>`` slots than ``words``.
    """
    mask = '<mask>'
    n_words = len(words)

    if n_words == 0:
        templates = []
    elif n_words == 1:
        templates = ['{} <mask>.'.format(words[0])]
    else:
        # Every word but the last is followed by a mask slot.
        leading = ''.join('{} <mask> '.format(w) for w in words[:-1])
        templates = [leading + '{}.'.format(words[-1])]

    if not if_then:
        return templates

    filled = templateA
    for word in words:
        # Replace the left-most remaining mask with the next word.
        pos = filled.index(mask)
        filled = filled[:pos] + word + filled[pos + len(mask):]
    return ['If ' + filled + ' then ' + t for t in templates]
def filter_words(words_prob):
word_count = {}
token1_count = {}
word2_count = {}
ret = []
for words, prob, *_ in words_prob:
filter_this = False
# filter repetitive token
token_count = {}
for word in words:
for token in word.split(' '):
if token in token_count:
filter_this = True
token_count[token] = 1
if filter_this:
prob *= 0.5
# filter repetitive words
if len(words) == 2 and words[0] == words[1]:
continue
# filter repetitive first token
token1 = words[0].split(' ')[0]
if token1 not in token1_count:
token1_count[token1] = 1
else:
token1_count[token1] += 1
prob /= token1_count[token1]
for word in words:
if word not in word_count:
word_count[word] = 0
word_count[word] += 1
prob /= word_count[word]
if len(words) == 2:
if words[1] not in word2_count:
word2_count[words[1]] = 0
word2_count[words[1]] += 1
prob /= word2_count[words[1]]
ret.append([words, prob])
return sorted(ret, key=lambda x: x[1], reverse=True)
import math
from copy import deepcopy
def convert_for_print(arr):
    """Return a rounded deep copy of ``arr`` suitable for printing.

    In every row the item at index 1 is rounded to 7 decimal places. When a
    row has exactly three items, each element of the item at index 2 is
    rounded to 7 decimal places as well. The input is not modified.
    """
    rounded = deepcopy(arr)
    for row in rounded:
        row[1] = round(row[1], 7)
        if len(row) != 3:
            continue
        extras = row[2]
        for pos, value in enumerate(extras):
            extras[pos] = round(value, 7)
    return rounded
def formalize_tA(tA):
    """Normalise the spacing and final punctuation of a template string.

    Surrounding whitespace is stripped, the text is made to end with exactly
    one appended period (whitespace before an existing final period is
    removed), and the space before each comma or apostrophe is dropped.
    """
    trimmed = tA.strip()
    # Peel off an existing final period so it can be re-attached tightly.
    core = trimmed[:-1].strip() if trimmed.endswith('.') else trimmed
    sentence = core + '.'
    return sentence.replace(' ,', ',').replace(" '", "'")
# Character n-gram size: passed as N to NGram in extract_similar_words and
# also used there as the length of the shortest candidate substring.
ngram_n = 3
def extract_similar_words(txt, words):
    """Find, for each word, the most similar substring of ``txt``.

    Candidate substrings of ``txt`` (lower-cased) are collected for every
    start position, with lengths from ``ngram_n`` up to four characters more
    than the longest entry in ``words``. The candidates are indexed with
    ``NGram`` and each lower-cased word is looked up with a similarity
    threshold of 0.5.

    Returns the list of matches in the order of ``words``, or ``None`` as
    soon as one word has no match.

    NOTE(review): the end index is exclusive and capped at ``len(txt)``, so
    no candidate ever includes the final character of ``txt`` — confirm this
    is intended (e.g. to skip a trailing period).
    """
    longest = max((len(w) for w in words), default=0)
    text_len = len(txt)

    candidates = [
        txt[start:stop].lower()
        for start in range(text_len)
        for stop in range(start + ngram_n, min(text_len, start + longest + 5))
    ]
    index = NGram(candidates, key=lambda x: x.lower(), N=ngram_n)

    matches = []
    for w in words:
        best = index.find(w.lower(), 0.5)
        if best is None:
            # One unmatched word invalidates the whole extraction.
            return None
        matches.append(best)
    return matches
def extract_words(txt, words):
    """Return ``words`` lower-cased if each one occurs verbatim in ``txt``.

    The containment check is case-sensitive and is done on the words as
    given. Returns ``None`` when any word is missing from ``txt``.
    """
    if any(w not in txt for w in words):
        return None
    return [w.lower() for w in words]