import math
from copy import deepcopy

from ngram import NGram

def post_process_template(tB):
    # Make sure the generated template ends with a period.
    if not tB.endswith('.'):
        tB += '.'
    return tB
    # Alternative: keep only the first sentence.
    # return tB.split('.')[0] + '.'
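
# Illustrative usage (outputs are what the function above returns):
#   post_process_template('dogs are animals')  -> 'dogs are animals.'
#   post_process_template('dogs are animals.') -> 'dogs are animals.'
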
def construct_template(words, templateA, if_then=False):
    # Build masked hypothesis templates from the extracted words.
    if len(words) == 2:
        # template = ['{} <mask> {}.'.format(words[0], words[1])]
        templates = [
            # '{} is <mask> {}.'.format(words[0], words[1]),
            '{} <mask> {}.'.format(words[0], words[1]),
        ]
    elif len(words) == 1:
        templates = [
            # '{} is <mask>.'.format(words[0]),
            '{} <mask>.'.format(words[0])]
    else:
        # No words (or an unexpected count): nothing to build.
        templates = []
    if if_then:
        # Fill templateA's masks with the words (first occurrence per word),
        # then wrap each template in an "If ... then ..." construction.
        for word in words:
            templateA = templateA.replace('<mask>', word, 1)
        templates = ['If ' + templateA + ' then ' + template for template in templates]
    return templates
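
# Illustrative usage ('<mask>' is presumably the mask token of the
# downstream masked LM; templateA below is a made-up example):
#   construct_template(['a dog', 'an animal'], '')
#   -> ['a dog <mask> an animal.']
#   construct_template(['an animal'], 'a dog is <mask>', if_then=True)
#   -> ['If a dog is an animal then an animal <mask>.']
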
def filter_words(words_prob):
    word_count = {}
    token1_count = {}
    word2_count = {}
    ret = []
    for words, prob, *_ in words_prob:
        # Filter repetitive tokens: halve the score of any candidate
        # in which the same token appears more than once.
        filter_this = False
        token_count = {}
        for word in words:
            for token in word.split(' '):
                if token in token_count:
                    filter_this = True
                token_count[token] = 1
        if filter_this:
            prob *= 0.5
        # Filter repetitive words: drop pairs whose two words are identical.
        if len(words) == 2 and words[0] == words[1]:
            continue
        # Filter repetitive first tokens: down-weight candidates whose
        # first token was already seen in earlier candidates.
        token1 = words[0].split(' ')[0]
        if token1 not in token1_count:
            token1_count[token1] = 1
        else:
            token1_count[token1] += 1
        prob /= token1_count[token1]
        # Down-weight words repeated across candidates.
        for word in words:
            if word not in word_count:
                word_count[word] = 0
            word_count[word] += 1
            prob /= word_count[word]
        # Down-weight repeated second words.
        if len(words) == 2:
            if words[1] not in word2_count:
                word2_count[words[1]] = 0
            word2_count[words[1]] += 1
            prob /= word2_count[words[1]]
        ret.append([words, prob])
    return sorted(ret, key=lambda x: x[1], reverse=True)
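
# Illustrative usage (filtering and down-weighting as implemented above):
#   filter_words([[('dog', 'dog'), 0.9], [('cat', 'animal'), 0.4]])
#   -> [[('cat', 'animal'), 0.4]]   # ('dog', 'dog') is dropped as repetitive
# A later candidate starting with the same first token, e.g. ('cat', 'pet'),
# would have its score halved via token1_count.
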
def convert_for_print(arr):
    # Round scores (and per-word probabilities, if present) for display.
    ret = deepcopy(arr)
    for i in range(len(ret)):
        ret[i][1] = round(ret[i][1], 7)
        if len(ret[i]) == 3:
            for j in range(len(ret[i][2])):
                ret[i][2][j] = round(ret[i][2][j], 7)
    return ret
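
# Illustrative usage:
#   convert_for_print([[('cat', 'animal'), 0.123456789]])
#   -> [[('cat', 'animal'), 0.1234568]]
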
def formalize_tA(tA):
    # Normalize a premise sentence: trim whitespace, ensure a single
    # trailing period, and remove spaces before commas and apostrophes.
    tA = tA.strip()
    if tA.endswith('.'):
        tA = tA[:-1].strip() + '.'
    else:
        tA += '.'
    tA = tA.replace(' ,', ',')
    tA = tA.replace(" '", "'")
    return tA
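
# Illustrative usage:
#   formalize_tA('  a dog is an animal ')      -> 'a dog is an animal.'
#   formalize_tA("a dog 's owner , I think .") -> "a dog's owner, I think."
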
# Character n-gram size used for fuzzy matching below.
ngram_n = 3

def extract_similar_words(txt, words):
    # Fuzzy-match each word against substrings of txt using character
    # n-gram similarity; return None as soon as any word has no match.
    max_word_length = max((len(word) for word in words), default=0)
    txt_ngrams = []
    for i in range(len(txt)):
        for j in range(i + ngram_n, min(len(txt), i + max_word_length + 5)):
            txt_ngrams.append(txt[i:j].lower())
    n = NGram(txt_ngrams, key=lambda x: x.lower(), N=ngram_n)
    ret = []
    for word in words:
        matched_word = n.find(word.lower(), 0.5)
        if matched_word is None:
            return None
        ret.append(matched_word)
    return ret
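
# Illustrative usage (requires the `ngram` package; the exact match depends
# on its similarity scoring, so this output is indicative only):
#   extract_similar_words('The dogs are loyal animals.', ['dog', 'animal'])
#   may return close substrings such as ['dog', 'animal'] or ['dogs', ...],
#   and returns None when a word falls below the 0.5 similarity threshold.
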
def extract_words(txt, words):
    # Exact variant: every word must appear verbatim in txt.
    for word in words:
        if word not in txt:
            return None
    return [word.lower() for word in words]
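
# Illustrative usage (exact substring matching, unlike the fuzzy variant):
#   extract_words('The dogs are loyal animals.', ['dogs', 'loyal'])
#   -> ['dogs', 'loyal']
#   extract_words('The dogs are loyal animals.', ['cat']) -> None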