Spaces:
Build error
Build error
from collections import defaultdict, Counter | |
from nltk.corpus import stopwords | |
class RuleBasedHeuristic:
    """Rule-based helpers for grouping adjacent tokens and collecting common tokens.

    Attributes are set directly from the constructor arguments:
        sentence: stored as given; not read by any method in this class.
        corpus: iterable of sentence strings, read by
            ``augment_using_most_frequent_starting_token``.
    """

    def __init__(self, sentence=None, corpus=None):
        self.sentence = sentence
        self.corpus = corpus

    def add_contiguous_titlecase_words(self, row):
        """Join tokens whose positions are consecutive into space-separated phrases.

        Args:
            row: sliceable sequence of indexable items where ``item[0]`` is a
                numeric position and ``item[-1]`` is the token string.
                (Presumably the tokens are the title-cased words of a
                sentence; this method does not check casing itself.)

        Returns:
            A list with one phrase per run of two or more items whose
            positions each increase by exactly 1. Items that are not adjacent
            to a neighbour produce no output.
        """
        matches = []
        run = []  # tokens of the run of adjacent positions currently being built
        for prev, curr in zip(row, row[1:]):
            if curr[0] - prev[0] == 1:
                if not run:
                    # First adjacent pair of a new run: seed it with the left token.
                    run.append(prev[-1])
                run.append(curr[-1])
            elif run:
                # Gap in positions closes the current run.
                matches.append(" ".join(run))
                run = []
        if run:
            matches.append(" ".join(run))
        return matches

    def augment_using_most_frequent_starting_token(self, N=1):
        """Return the ``N`` most frequent first tokens across ``self.corpus``.

        Sentences that contain no tokens (empty or whitespace-only) are
        skipped, and a corpus of ``None`` is treated as empty.

        Args:
            N: number of entries to return, passed to ``Counter.most_common``.

        Returns:
            A list of ``(token, count)`` tuples, most frequent first.
        """
        if self.corpus is None:
            return []
        first_tokens = []
        for sentence in self.corpus:
            tokens = sentence.split()
            if tokens:  # guard: indexing [0] on a token-less sentence would raise
                first_tokens.append(tokens[0])
        return Counter(first_tokens).most_common(N)

    def get_top_tokens(self, top_most_common_ptb=None):
        """Return the English stopword set, optionally extended with frequent starters.

        Args:
            top_most_common_ptb: when truthy, the number of most frequent
                sentence-starting tokens from ``self.corpus`` to add to the
                stopword set.

        Returns:
            A ``set`` of token strings.
        """
        out = set(stopwords.words("english"))
        if top_most_common_ptb:
            starters = self.augment_using_most_frequent_starting_token(
                N=top_most_common_ptb
            )
            out.update(token for token, _count in starters)
        return out